[
  {
    "path": ".bandit.yml",
    "content": "skips:\n- B101\n- B311\n- B113  # `Requests call without timeout` these requests are done in the benchmark and examples scripts only\n- B403  # We are using pickle for tests only\n- B404  # Using subprocess library\n- B602  # subprocess call with shell=True identified\n- B110  # Try, Except, Pass detected.\n- B104  # Possible binding to all interfaces.\n- B301  # Pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue.\n- B108  # Probable insecure usage of temp file/directory."
  },
  {
    "path": ".dockerignore",
    "content": "# Github\n.github/\n\n# docs\ndocs/\nimages/\n.cache/\n.claude/\n\n# cached files\n__pycache__/\n*.py[cod]\n.cache\n.DS_Store\n*~\n.*.sw[po]\n.build\n.ve\n.env\n.pytest\n.benchmarks\n.bootstrap\n.appveyor.token\n*.bak\n*.db\n*.db-*\n\n# installation package\n*.egg-info/\ndist/\nbuild/\n\n# environments\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# C extensions\n*.so\n\n# pycharm\n.idea/\n\n# vscode\n*.code-workspace\n\n# Packages\n*.egg\n*.egg-info\ndist\nbuild\neggs\n.eggs\nparts\nbin\nvar\nsdist\nwheelhouse\ndevelop-eggs\n.installed.cfg\nlib\nlib64\nvenv*/\n.venv*/\npyvenv*/\npip-wheel-metadata/\npoetry.lock\n\n# Installer logs\npip-log.txt\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\nmypy.ini\n\n# test caches\n.tox/\n.pytest_cache/\n.coverage\nhtmlcov\nreport.xml\nnosetests.xml\ncoverage.xml\n\n# Translations\n*.mo\n\n# Buildout\n.mr.developer.cfg\n\n# IDE project files\n.project\n.pydevproject\n.idea\n*.iml\n*.komodoproject\n\n# Complexity\noutput/*.html\noutput/*/index.html\n\n# Sphinx\ndocs/_build\npublic/\nweb/\n"
  },
  {
    "path": ".github/FUNDING.yml",
    "content": "github: D4Vinci\nbuy_me_a_coffee: d4vinci\nko_fi: d4vinci\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/01-bug_report.yml",
    "content": "name: Bug report\ndescription: Create a bug report to help us address errors in the repository\nlabels: [bug]\nbody:\n  - type: checkboxes\n    attributes:\n      label: Have you searched if there an existing issue for this?\n      description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/bug).\n      options:\n        - label: I have searched the existing issues\n          required: true\n\n  - type: input\n    attributes:\n      label: \"Python version (python --version)\"\n      placeholder: \"Python 3.8\"\n    validations:\n      required: true\n\n  - type: input\n    attributes:\n      label: \"Scrapling version (scrapling.__version__)\"\n      placeholder: \"0.1\"\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: \"Dependencies version (pip3 freeze)\"\n      description: >\n        This is the output of the command `pip3 freeze --all`. Note that the\n        actual output might be different as compared to the placeholder text.\n      placeholder: |\n        cssselect==1.2.0\n        lxml==5.3.0\n        orjson==3.10.7\n        ...\n    validations:\n      required: true\n\n  - type: input\n    attributes:\n      label: \"What's your operating system?\"\n      placeholder: \"Windows 10\"\n    validations:\n      required: true\n\n  - type: dropdown\n    attributes:\n      label: 'Are you using a separate virtual environment?'\n      description: \"Please pay attention to this question\"\n      options:\n        - 'No'\n        - 'Yes'\n      default: 0\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: \"Expected behavior\"\n      description: \"Describe the behavior you expect. May include images or videos.\"\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: \"Actual behavior\"\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: Steps To Reproduce\n      description: Steps to reproduce the behavior.\n      placeholder: |\n        1. In this environment...\n        2. With this config...\n        3. Run '...'\n        4. See error...\n    validations:\n      required: false\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/02-feature_request.yml",
    "content": "name: Feature request\ndescription: Suggest features, propose improvements, discuss new ideas.\nlabels: [enhancement]\nbody:\n  - type: checkboxes\n    attributes:\n      label: Have you searched if there an existing feature request for this?\n      description: Please search [existing requests](https://github.com/D4Vinci/Scrapling/labels/enhancement).\n      options:\n        - label: I have searched the existing requests\n          required: true\n\n  - type: textarea\n    attributes:\n      label: \"Feature description\"\n      description: >\n        This could include new topics or improving any existing features/implementations.\n    validations:\n      required: true"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/03-other.yml",
    "content": "name: Other\ndescription: Use this for any other issues. PLEASE provide as much information as possible.\nlabels: [\"awaiting triage\"]\nbody:\n  - type: textarea\n    id: issuedescription\n    attributes:\n      label: What would you like to share?\n      description: Provide a clear and concise explanation of your issue.\n    validations:\n      required: true\n\n  - type: textarea\n    id: extrainfo\n    attributes:\n      label: Additional information\n      description: Is there anything else we should know about this issue?\n    validations:\n      required: false"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/04-docs_issue.yml",
    "content": "name: Documentation issue\ndescription: Report incorrect, unclear, or missing documentation.\nlabels: [documentation]\nbody:\n  - type: checkboxes\n    attributes:\n      label: Have you searched if there an existing issue for this?\n      description: Please search [existing issues](https://github.com/D4Vinci/Scrapling/labels/documentation).\n      options:\n        - label: I have searched the existing issues\n          required: true\n\n  - type: input\n    attributes:\n      label: \"Page URL\"\n      description: \"Link to the documentation page with the issue.\"\n      placeholder: \"https://scrapling.readthedocs.io/en/latest/...\"\n    validations:\n      required: true\n\n  - type: dropdown\n    attributes:\n      label: \"Type of issue\"\n      options:\n        - Incorrect information\n        - Unclear or confusing\n        - Missing information\n        - Typo or formatting\n        - Broken link\n        - Other\n      default: 0\n    validations:\n      required: true\n\n  - type: textarea\n    attributes:\n      label: \"Description\"\n      description: \"Describe what's wrong and what you expected to find.\"\n    validations:\n      required: true\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "content": "blank_issues_enabled: false\ncontact_links:\n- name: Discussions\n  url: https://github.com/D4Vinci/Scrapling/discussions\n  about: >\n    The \"Discussions\" forum is where you want to start. 💖\n- name: Ask on our discord server\n  url: https://discord.gg/EMgGbDceNQ\n  about: >\n    Our community chat forum."
  },
  {
    "path": ".github/PULL_REQUEST_TEMPLATE.md",
    "content": "<!--\n  You are amazing! Thanks for contributing to Scrapling!\n  Please, DO NOT DELETE ANY TEXT from this template! (unless instructed).\n-->\n\n## Proposed change\n<!--\n  Describe the big picture of your changes here to communicate to the maintainers why we should accept this pull request.\n  If it fixes a bug or resolves a feature request, be sure to link to that issue in the additional information section.\n-->\n\n\n### Type of change:\n<!--\n  What type of change does your PR introduce to Scrapling?\n  NOTE: Please, check at least 1 box!\n  If your PR requires multiple boxes to be checked, you'll most likely need to\n  split it into multiple PRs. This makes things easier and faster to code review.\n-->\n\n\n\n- [ ] Dependency upgrade\n- [ ] Bugfix (non-breaking change which fixes an issue)\n- [ ] New integration (thank you!)\n- [ ] New feature (which adds functionality to an existing integration)\n- [ ] Deprecation (breaking change to happen in the future)\n- [ ] Breaking change (fix/feature causing existing functionality to break)\n- [ ] Code quality improvements to existing code or addition of tests\n- [ ] Add or change doctests? -- Note: Please avoid changing both code and tests in a single pull request.\n- [ ] Documentation change?\n\n### Additional information\n<!--\n  Details are important and help maintainers processing your PR.\n  Please be sure to fill out additional details, if applicable.\n-->\n\n- This PR fixes or closes an issue: fixes #\n- This PR is related to an issue: #\n- Link to documentation pull request: **\n\n### Checklist:\n* [ ] I have read [CONTRIBUTING.md](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md).\n* [ ] This pull request is all my own work -- I have not plagiarized.\n* [ ] I know that pull requests will not be merged if they fail the automated tests.\n* [ ] All new Python files are placed inside an existing directory.\n* [ ] All filenames are in all lowercase characters with no spaces or dashes.\n* [ ] All functions and variable names follow Python naming conventions.\n* [ ] All function parameters and return values are annotated with Python [type hints](https://docs.python.org/3/library/typing.html).\n* [ ] All functions have doc-strings.\n"
  },
  {
    "path": ".github/workflows/code-quality.yml",
    "content": "name: Code Quality\n\non:\n  push:\n    branches:\n      - main\n      - dev\n    paths-ignore:\n      - '*.md'\n      - '**/*.md'\n      - 'docs/**'\n      - 'images/**'\n      - '.github/**'\n      - 'agent-skill/**'\n      - '!.github/workflows/code-quality.yml'  # Always run when this workflow changes\n  pull_request:\n    branches:\n      - main\n      - dev\n    paths-ignore:\n      - '*.md'\n      - '**/*.md'\n      - 'docs/**'\n      - 'images/**'\n      - '.github/**'\n      - 'agent-skill/**'\n      - '*.yml'\n      - '*.yaml'\n      - 'ruff.toml'\n  workflow_dispatch:  # Allow manual triggering\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: true\n\njobs:\n  code-quality:\n    name: Code Quality Checks\n    runs-on: ubuntu-latest\n    permissions:\n      contents: read\n      pull-requests: write  # For PR annotations\n\n    steps:\n      - name: Checkout code\n        uses: actions/checkout@v6\n        with:\n          fetch-depth: 0  # Full history for better analysis\n\n      - name: Set up Python\n        uses: actions/setup-python@v6\n        with:\n          python-version: '3.10'\n          cache: 'pip'\n\n      - name: Install dependencies\n        run: |\n          python -m pip install --upgrade pip\n          pip install bandit[toml] ruff vermin mypy pyright\n          pip install -e \".[all]\"\n          pip install lxml-stubs\n\n      - name: Run Bandit (Security Linter)\n        id: bandit\n        continue-on-error: true\n        run: |\n          echo \"::group::Bandit - Security Linter\"\n          bandit -r -c .bandit.yml scrapling/ -f json -o bandit-report.json\n          bandit -r -c .bandit.yml scrapling/\n          echo \"::endgroup::\"\n\n      - name: Run Ruff Linter\n        id: ruff-lint\n        continue-on-error: true\n        run: |\n          echo \"::group::Ruff - Linter\"\n          ruff check scrapling/ --output-format=github\n          echo \"::endgroup::\"\n\n      - name: Run Ruff Formatter Check\n        id: ruff-format\n        continue-on-error: true\n        run: |\n          echo \"::group::Ruff - Formatter Check\"\n          ruff format --check scrapling/ --diff\n          echo \"::endgroup::\"\n\n      - name: Run Vermin (Python Version Compatibility)\n        id: vermin\n        continue-on-error: true\n        run: |\n          echo \"::group::Vermin - Python 3.10+ Compatibility Check\"\n          vermin -t=3.10- --violations --eval-annotations --no-tips scrapling/\n          echo \"::endgroup::\"\n\n      - name: Run Mypy (Static Type Checker)\n        id: mypy\n        continue-on-error: true\n        run: |\n          echo \"::group::Mypy - Static Type Checker\"\n          mypy scrapling/\n          echo \"::endgroup::\"\n\n      - name: Run Pyright (Static Type Checker)\n        id: pyright\n        continue-on-error: true\n        run: |\n          echo \"::group::Pyright - Static Type Checker\"\n          pyright scrapling/\n          echo \"::endgroup::\"\n\n      - name: Check results and create summary\n        if: always()\n        run: |\n          echo \"# Code Quality Check Results\" >> $GITHUB_STEP_SUMMARY\n          echo \"\" >> $GITHUB_STEP_SUMMARY\n\n          # Initialize status\n          all_passed=true\n\n          # Check Bandit\n          if [ \"${{ steps.bandit.outcome }}\" == \"success\" ]; then\n            echo \"✅ **Bandit (Security)**: Passed\" >> $GITHUB_STEP_SUMMARY\n          else\n            echo \"❌ **Bandit (Security)**: Failed\" >> 
$GITHUB_STEP_SUMMARY\n            all_passed=false\n          fi\n\n          # Check Ruff Linter\n          if [ \"${{ steps.ruff-lint.outcome }}\" == \"success\" ]; then\n            echo \"✅ **Ruff Linter**: Passed\" >> $GITHUB_STEP_SUMMARY\n          else\n            echo \"❌ **Ruff Linter**: Failed\" >> $GITHUB_STEP_SUMMARY\n            all_passed=false\n          fi\n\n          # Check Ruff Formatter\n          if [ \"${{ steps.ruff-format.outcome }}\" == \"success\" ]; then\n            echo \"✅ **Ruff Formatter**: Passed\" >> $GITHUB_STEP_SUMMARY\n          else\n            echo \"❌ **Ruff Formatter**: Failed\" >> $GITHUB_STEP_SUMMARY\n            all_passed=false\n          fi\n\n          # Check Vermin\n          if [ \"${{ steps.vermin.outcome }}\" == \"success\" ]; then\n            echo \"✅ **Vermin (Python 3.10+)**: Passed\" >> $GITHUB_STEP_SUMMARY\n          else\n            echo \"❌ **Vermin (Python 3.10+)**: Failed\" >> $GITHUB_STEP_SUMMARY\n            all_passed=false\n          fi\n\n          # Check Mypy\n          if [ \"${{ steps.mypy.outcome }}\" == \"success\" ]; then\n            echo \"✅ **Mypy (Type Checker)**: Passed\" >> $GITHUB_STEP_SUMMARY\n          else\n            echo \"❌ **Mypy (Type Checker)**: Failed\" >> $GITHUB_STEP_SUMMARY\n            all_passed=false\n          fi\n\n          # Check Pyright\n          if [ \"${{ steps.pyright.outcome }}\" == \"success\" ]; then\n            echo \"✅ **Pyright (Type Checker)**: Passed\" >> $GITHUB_STEP_SUMMARY\n          else\n            echo \"❌ **Pyright (Type Checker)**: Failed\" >> $GITHUB_STEP_SUMMARY\n            all_passed=false\n          fi\n\n          echo \"\" >> $GITHUB_STEP_SUMMARY\n\n          if [ \"$all_passed\" == \"true\" ]; then\n            echo \"### 🎉 All checks passed!\" >> $GITHUB_STEP_SUMMARY\n            echo \"\" >> $GITHUB_STEP_SUMMARY\n            echo \"Your code meets all quality standards.\" >> $GITHUB_STEP_SUMMARY\n          else\n            echo \"### ⚠️ Some checks failed\" >> $GITHUB_STEP_SUMMARY\n            echo \"\" >> $GITHUB_STEP_SUMMARY\n            echo \"Please review the errors above and fix them.\" >> $GITHUB_STEP_SUMMARY\n            echo \"\" >> $GITHUB_STEP_SUMMARY\n            echo \"**Tip**: Run \\`pre-commit run --all-files\\` locally to catch these issues before pushing.\" >> $GITHUB_STEP_SUMMARY\n            exit 1\n          fi\n\n      - name: Upload Bandit report\n        if: always() && steps.bandit.outcome != 'skipped'\n        uses: actions/upload-artifact@v6\n        with:\n          name: bandit-security-report\n          path: bandit-report.json\n          retention-days: 30\n"
  },
  {
    "path": ".github/workflows/docker-build.yml",
    "content": "name: Build and Push Docker Image\n\non:\n  pull_request:\n    types: [closed]\n    branches:\n      - main\n  workflow_dispatch:\n    inputs:\n      tag:\n        description: 'Docker image tag'\n        required: true\n        default: 'latest'\n\nenv:\n  DOCKERHUB_IMAGE: pyd4vinci/scrapling\n  GHCR_IMAGE: ghcr.io/${{ github.repository_owner }}/scrapling\n\njobs:\n  build-and-push:\n    runs-on: ubuntu-latest\n    permissions:\n      contents: read\n      packages: write\n\n    steps:\n    - name: Checkout repository\n      uses: actions/checkout@v6\n\n    - name: Set up Docker Buildx\n      uses: docker/setup-buildx-action@v3\n      with:\n        platforms: linux/amd64,linux/arm64\n\n    - name: Log in to Docker Hub\n      uses: docker/login-action@v3\n      with:\n        registry: docker.io\n        username: ${{ secrets.DOCKER_USERNAME }}\n        password: ${{ secrets.DOCKER_PASSWORD }}\n\n    - name: Log in to GitHub Container Registry\n      uses: docker/login-action@v3\n      with:\n        registry: ghcr.io\n        username: ${{ github.actor }}\n        password: ${{ secrets.CONTAINER_TOKEN }}\n\n    - name: Extract metadata\n      id: meta\n      uses: docker/metadata-action@v5\n      with:\n        images: |\n          ${{ env.DOCKERHUB_IMAGE }}\n          ${{ env.GHCR_IMAGE }}\n        tags: |\n          type=ref,event=branch\n          type=ref,event=pr\n          type=semver,pattern={{version}}\n          type=semver,pattern={{major}}.{{minor}}\n          type=semver,pattern={{major}}\n          type=raw,value=latest,enable={{is_default_branch}}\n        labels: |\n          org.opencontainers.image.title=Scrapling\n          org.opencontainers.image.description=An undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!\n          org.opencontainers.image.vendor=D4Vinci\n          org.opencontainers.image.licenses=BSD\n          org.opencontainers.image.url=https://scrapling.readthedocs.io/en/latest/\n          org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }}\n          org.opencontainers.image.documentation=https://scrapling.readthedocs.io/en/latest/\n\n    - name: Build and push Docker image\n      uses: docker/build-push-action@v6\n      with:\n        context: .\n        platforms: linux/amd64,linux/arm64\n        push: true\n        tags: ${{ steps.meta.outputs.tags }}\n        labels: ${{ steps.meta.outputs.labels }}\n        cache-from: type=gha\n        cache-to: type=gha,mode=max\n        build-args: |\n          BUILDKIT_INLINE_CACHE=1\n\n    - name: Image digest\n      run: echo ${{ steps.build.outputs.digest }}"
  },
  {
    "path": ".github/workflows/release-and-publish.yml",
    "content": "name: Create Release and Publish to PyPI\n# Creates a GitHub release when a PR is merged to main (using PR title as version and body as release notes), then publishes to PyPI.\n\non:\n  pull_request:\n    types: [closed]\n    branches:\n      - main\n\njobs:\n  create-release-and-publish:\n    if: github.event.pull_request.merged == true\n    runs-on: ubuntu-latest\n    environment:\n      name: PyPI\n      url: https://pypi.org/p/scrapling\n    permissions:\n      contents: write\n      id-token: write\n    steps:\n      - uses: actions/checkout@v6\n        with:\n          fetch-depth: 0\n\n      - name: Get PR title\n        id: pr_title\n        run: echo \"title=${{ github.event.pull_request.title }}\" >> $GITHUB_OUTPUT\n\n      - name: Save PR body to file\n        uses: actions/github-script@v8\n        with:\n          script: |\n            const fs = require('fs');\n            fs.writeFileSync('pr_body.md', context.payload.pull_request.body || '');\n\n      - name: Extract version\n        id: extract_version\n        run: |\n          PR_TITLE=\"${{ steps.pr_title.outputs.title }}\"\n          if [[ $PR_TITLE =~ ^v ]]; then\n            echo \"version=$PR_TITLE\" >> $GITHUB_OUTPUT\n            echo \"Valid version format found in PR title: $PR_TITLE\"\n          else\n            echo \"Error: PR title '$PR_TITLE' must start with 'v' (e.g., 'v1.0.0') to create a release.\"\n            exit 1\n          fi\n\n      - name: Create Release\n        uses: softprops/action-gh-release@v2\n        with:\n          tag_name: ${{ steps.extract_version.outputs.version }}\n          name: Release ${{ steps.extract_version.outputs.version }}\n          body_path: pr_body.md\n          draft: false\n          prerelease: false\n        env:\n          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n\n      - name: Set up Python\n        uses: actions/setup-python@v6\n        with:\n          python-version: 3.12\n\n      - name: Upgrade pip\n        run: python3 -m pip install --upgrade pip\n\n      - name: Install build\n        run: python3 -m pip install --upgrade build twine setuptools\n\n      - name: Build a binary wheel and a source tarball\n        run: python3 -m build --sdist --wheel --outdir dist/\n\n      - name: Publish distribution 📦 to PyPI\n        uses: pypa/gh-action-pypi-publish@release/v1"
  },
  {
    "path": ".github/workflows/tests.yml",
    "content": "name: Tests\non:\n  push:\n    branches:\n      - main\n      - dev\n    paths-ignore:\n      - '*.md'\n      - '**/*.md'\n      - 'docs/**'\n      - 'images/**'\n      - '.github/**'\n      - 'agent-skill/**'\n      - '*.yml'\n      - '*.yaml'\n      - 'ruff.toml'\n  pull_request:\n    branches:\n      - main\n      - dev\n    paths-ignore:\n      - '*.md'\n      - '**/*.md'\n      - 'docs/**'\n      - 'images/**'\n      - '.github/**'\n      - 'agent-skill/**'\n      - '*.yml'\n      - '*.yaml'\n      - 'ruff.toml'\n\nconcurrency:\n  group: ${{github.workflow}}-${{ github.ref }}\n  cancel-in-progress: true\n\njobs:\n  tests:\n    timeout-minutes: 60\n    runs-on: ${{ matrix.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        include:\n        - python-version: \"3.10\"\n          os: macos-latest\n          env:\n            TOXENV: py310\n        - python-version: \"3.11\"\n          os: macos-latest\n          env:\n            TOXENV: py311\n        - python-version: \"3.12\"\n          os: macos-latest\n          env:\n            TOXENV: py312\n        - python-version: \"3.13\"\n          os: macos-latest\n          env:\n            TOXENV: py313\n\n    steps:\n    - uses: actions/checkout@v6\n\n    - name: Set up Python ${{ matrix.python-version }}\n      uses: actions/setup-python@v6\n      with:\n        python-version: ${{ matrix.python-version }}\n        cache: 'pip'\n        cache-dependency-path: |\n          pyproject.toml\n          tox.ini\n\n    - name: Install all browsers dependencies\n      run: |\n        python3 -m pip install --upgrade pip\n        python3 -m pip install playwright==1.58.0 patchright==1.58.2\n\n    - name: Get Playwright version\n      id: playwright-version\n      run: |\n        PLAYWRIGHT_VERSION=$(python3 -c \"import importlib.metadata; print(importlib.metadata.version('playwright'))\")\n        echo \"version=$PLAYWRIGHT_VERSION\" >> $GITHUB_OUTPUT\n        echo \"Playwright version: $PLAYWRIGHT_VERSION\"\n\n    - name: Retrieve Playwright browsers from cache if any\n      id: playwright-cache\n      uses: actions/cache@v5\n      with:\n        path: |\n          ~/.cache/ms-playwright\n          ~/Library/Caches/ms-playwright\n          ~/.ms-playwright\n        key: ${{ runner.os }}-playwright-${{ steps.playwright-version.outputs.version }}-v1\n        restore-keys: |\n          ${{ runner.os }}-playwright-${{ steps.playwright-version.outputs.version }}-\n          ${{ runner.os }}-playwright-\n\n    - name: Install Playwright browsers\n      run: |\n        echo \"Cache hit: ${{ steps.playwright-cache.outputs.cache-hit }}\"\n        if [ \"${{ steps.playwright-cache.outputs.cache-hit }}\" != \"true\" ]; then\n          python3 -m playwright install chromium\n        else\n          echo \"Skipping install - using cached Playwright browsers\"\n        fi\n        python3 -m playwright install-deps chromium\n\n    # Cache tox environments\n    - name: Cache tox environments\n      uses: actions/cache@v5\n      with:\n        path: .tox\n        # Include python version and os in the cache key\n        key: tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('/Users/runner/work/Scrapling/pyproject.toml') }}\n        restore-keys: |\n          tox-v1-${{ runner.os }}-py${{ matrix.python-version }}-\n          tox-v1-${{ runner.os }}-\n\n    - name: Install tox\n      run: pip install -U tox\n\n    - name: Run tests\n      env: ${{ matrix.env }}\n      run: tox"
  },
  {
    "path": ".gitignore",
    "content": "# local files\nsite/*\nlocal_tests/*\n.mcpregistry_*\n\n# AI related files\n.claude/*\nCLAUDE.md\n\n# cached files\n__pycache__/\n*.py[cod]\n.cache\n.DS_Store\n*~\n.*.sw[po]\n.build\n.ve\n.env\n.pytest\n.benchmarks\n.bootstrap\n.appveyor.token\n*.bak\n*.db\n*.db-*\n\n# installation package\n*.egg-info/\ndist/\nbuild/\n\n# environments\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# C extensions\n*.so\n\n# pycharm\n.idea/\n\n# vscode\n*.code-workspace\n\n# Packages\n*.egg\n*.egg-info\ndist\nbuild\neggs\n.eggs\nparts\nbin\nvar\nsdist\nwheelhouse\ndevelop-eggs\n.installed.cfg\nlib\nlib64\nvenv*/\n.venv*/\npyvenv*/\npip-wheel-metadata/\npoetry.lock\n\n# Installer logs\npip-log.txt\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\nmypy.ini\n\n# test caches\n.tox/\n.pytest_cache/\n.coverage\nhtmlcov\nreport.xml\nnosetests.xml\ncoverage.xml\n\n# Translations\n*.mo\n\n# Buildout\n.mr.developer.cfg\n\n# IDE project files\n.project\n.pydevproject\n.idea\n*.iml\n*.komodoproject\n\n# Complexity\noutput/*.html\noutput/*/index.html\n\n# Sphinx\ndocs/_build\npublic/\nweb/\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "repos:\n- repo: https://github.com/PyCQA/bandit\n  rev: 1.9.0\n  hooks:\n  - id: bandit\n    args: [-r, -c, .bandit.yml]\n- repo: https://github.com/astral-sh/ruff-pre-commit\n  # Ruff version.\n  rev: v0.14.5\n  hooks:\n    # Run the linter.\n    - id: ruff\n      args: [ --fix ]\n    # Run the formatter.\n    - id: ruff-format\n- repo: https://github.com/netromdk/vermin\n  rev: v1.7.0\n  hooks:\n  - id: vermin\n    args: ['-t=3.10-', '--violations', '--eval-annotations', '--no-tips']\n"
  },
  {
    "path": ".readthedocs.yaml",
    "content": "# See https://docs.readthedocs.com/platform/stable/intro/zensical.html for details\n# Example: https://github.com/readthedocs/test-builds/tree/zensical\n\nversion: 2\n\nbuild:\n  os: ubuntu-24.04\n  apt_packages:\n    - pngquant\n  tools:\n    python: \"3.13\"\n  jobs:\n    install:\n      - pip install -r docs/requirements.txt\n      - pip install \".[all]\"\n    build:\n      html:\n        - zensical build\n    post_build:\n      - mkdir -p $READTHEDOCS_OUTPUT/html/\n      - cp --recursive site/* $READTHEDOCS_OUTPUT/html/\n"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participation in our\ncommunity a harassment-free experience for everyone, regardless of age, body\nsize, visible or invisible disability, ethnicity, sex characteristics, gender\nidentity and expression, level of experience, education, socio-economic status,\nnationality, personal appearance, race, religion, or sexual identity\nand orientation.\n\nWe pledge to act and interact in ways that contribute to an open, welcoming,\ndiverse, inclusive, and healthy community.\n\n## Our Standards\n\nExamples of behavior that contributes to a positive environment for our\ncommunity include:\n\n* Demonstrating empathy and kindness toward other people\n* Being respectful of differing opinions, viewpoints, and experiences\n* Giving and gracefully accepting constructive feedback\n* Accepting responsibility and apologizing to those affected by our mistakes,\n  and learning from the experience\n* Focusing on what is best not just for us as individuals, but for the\n  overall community\n\nExamples of unacceptable behavior include:\n\n* The use of sexualized language or imagery, and sexual attention or\n  advances of any kind\n* Trolling, insulting or derogatory comments, and personal or political attacks\n* Public or private harassment\n* Publishing others' private information, such as a physical or email\n  address, without their explicit permission\n* Other conduct which could reasonably be considered inappropriate in a\n  professional setting\n\n## Enforcement Responsibilities\n\nCommunity leaders are responsible for clarifying and enforcing our standards of\nacceptable behavior and will take appropriate and fair corrective action in\nresponse to any behavior that they deem inappropriate, threatening, offensive,\nor harmful.\n\nCommunity leaders have the right and responsibility to remove, edit, or reject\ncomments, commits, code, wiki edits, issues, and other contributions that are\nnot aligned to this Code of Conduct, and will communicate reasons for moderation\ndecisions when appropriate.\n\n## Scope\n\nThis Code of Conduct applies within all community spaces, and also applies when\nan individual is officially representing the community in public spaces.\nExamples of representing our community include using an official e-mail address,\nposting via an official social media account, or acting as an appointed\nrepresentative at an online or offline event.\n\n## Enforcement\n\nInstances of abusive, harassing, or otherwise unacceptable behavior may be\nreported to the community leaders responsible for enforcement at\nkarim.shoair@pm.me.\nAll complaints will be reviewed and investigated promptly and fairly.\n\nAll community leaders are obligated to respect the privacy and security of the\nreporter of any incident.\n\n## Enforcement Guidelines\n\nCommunity leaders will follow these Community Impact Guidelines in determining\nthe consequences for any action they deem in violation of this Code of Conduct:\n\n### 1. Correction\n\n**Community Impact**: Use of inappropriate language or other behavior deemed\nunprofessional or unwelcome in the community.\n\n**Consequence**: A private, written warning from community leaders, providing\nclarity around the nature of the violation and an explanation of why the\nbehavior was inappropriate. A public apology may be requested.\n\n### 2. 
Warning\n\n**Community Impact**: A violation through a single incident or series\nof actions.\n\n**Consequence**: A warning with consequences for continued behavior. No\ninteraction with the people involved, including unsolicited interaction with\nthose enforcing the Code of Conduct, for a specified period of time. This\nincludes avoiding interactions in community spaces as well as external channels\nlike social media. Violating these terms may lead to a temporary or\npermanent ban.\n\n### 3. Temporary Ban\n\n**Community Impact**: A serious violation of community standards, including\nsustained inappropriate behavior.\n\n**Consequence**: A temporary ban from any sort of interaction or public\ncommunication with the community for a specified period of time. No public or\nprivate interaction with the people involved, including unsolicited interaction\nwith those enforcing the Code of Conduct, is allowed during this period.\nViolating these terms may lead to a permanent ban.\n\n### 4. Permanent Ban\n\n**Community Impact**: Demonstrating a pattern of violation of community\nstandards, including sustained inappropriate behavior,  harassment of an\nindividual, or aggression toward or disparagement of classes of individuals.\n\n**Consequence**: A permanent ban from any sort of public interaction within\nthe community.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage],\nversion 2.0, available at\nhttps://www.contributor-covenant.org/version/2/0/code_of_conduct.html.\n\nCommunity Impact Guidelines were inspired by [Mozilla's code of conduct\nenforcement ladder](https://github.com/mozilla/diversity).\n\n[homepage]: https://www.contributor-covenant.org\n\nFor answers to common questions about this code of conduct, see the FAQ at\nhttps://www.contributor-covenant.org/faq. Translations are available at\nhttps://www.contributor-covenant.org/translations.\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing to Scrapling\n\nThank you for your interest in contributing to Scrapling! \n\nEverybody is invited and welcome to contribute to Scrapling. \n\nMinor changes are more likely to be included promptly. Adding unit tests for new features or test cases for bugs you've fixed helps us ensure that the Pull Request (PR) is acceptable.\n\nThere are many ways to contribute to Scrapling. Here are some of them:\n\n- Report bugs and request features using the [GitHub issues](https://github.com/D4Vinci/Scrapling/issues). Please follow the issue template to help us resolve your issue quickly.\n- Blog about Scrapling. Tell the world how you’re using Scrapling. This will help newcomers with more examples and increase the Scrapling project's visibility.\n- Join the [Discord community](https://discord.gg/EMgGbDceNQ) and share your ideas on how to improve Scrapling. We’re always open to suggestions.\n- If you are not a developer, perhaps you would like to help with translating the [documentation](https://github.com/D4Vinci/Scrapling/tree/docs)?\n\n## Making a Pull Request\nTo ensure that your PR gets accepted, please make sure that your PR is based on the latest changes from the dev branch and that it satisfies the following requirements:\n\n- **The PR must be made against the [**dev**](https://github.com/D4Vinci/Scrapling/tree/dev) branch of Scrapling. Any PR made against the main branch will be rejected.**\n- **The code should be passing all available tests. We use tox with GitHub's CI to run the current tests on all supported Python versions for every code-related commit.**\n- **The code should be passing all code quality checks like `mypy` and `pyright`. We are using GitHub's CI to enforce code style checks as well.**\n- **Make your changes, keep the code clean with an explanation of any part that might be vague, and remember to create a separate virtual environment for this project.**\n- If you are adding a new feature, please add tests for it.\n- If you are fixing a bug, please add code with the PR that reproduces the bug.\n- Please follow the rules and coding style rules we explain below.\n\n\n## Finding work\n\nIf you have decided to make a contribution to Scrapling, but you do not know what to contribute, here are some ways to find pending work:\n\n- Check out the [contribution](https://github.com/D4Vinci/Scrapling/contribute) GitHub page, which lists open issues tagged as `good first issue`. These issues provide a good starting point.\n- There are also the [help wanted](https://github.com/D4Vinci/Scrapling/issues?q=is%3Aissue%20label%3A%22help%20wanted%22%20state%3Aopen) issues, but know that some may require familiarity with the Scrapling code base first. You can also target any other issue, provided it is not tagged as `invalid`, `wontfix`, or similar tags.\n- If you enjoy writing automated tests, you can work on increasing our test coverage. Currently, the test coverage is around 90–92%.\n- Join the [Discord community](https://discord.gg/EMgGbDceNQ) and ask questions in the `#help` channel.\n\n## Coding style\nPlease follow these coding conventions as we do when writing code for Scrapling:\n- We use [pre-commit](https://pre-commit.com/) to automatically address simple code issues before every commit, so please install it and run `pre-commit install` to set it up. This will install hooks to run [ruff](https://docs.astral.sh/ruff/), [bandit](https://github.com/PyCQA/bandit), and [vermin](https://github.com/netromdk/vermin) on every commit. 
We are currently using a workflow to automatically run these tools on every PR, so if your code doesn't pass these checks, the PR will be rejected.\n- We use type hints for better code clarity and [pyright](https://github.com/microsoft/pyright)/[mypy](https://github.com/python/mypy) for static type checking. If your code isn't acceptable by those tools, your PR won't pass the code quality rule.\n- We use the conventional commit messages format as [here](https://gist.github.com/qoomon/5dfcdf8eec66a051ecd85625518cfd13#types), so for example, we use the following prefixes for commit messages:\n   \n   | Prefix      | When to use it           |\n   |-------------|--------------------------|\n   | `feat:`     | New feature added        |\n   | `fix:`      | Bug fix                  |\n   | `docs:`     | Documentation change/add |\n   | `test:`     | Tests                    |\n   | `refactor:` | Code refactoring         |\n   | `chore:`    | Maintenance tasks        |\n    \n    Then include the details of the change in the commit message body/description.\n\n   Example:\n   ```\n   feat: add `adaptive` for similar elements\n   \n   - Added find_similar() method\n   - Implemented pattern matching\n   - Added tests and documentation\n   ```\n\n> Please don’t put your name in the code you contribute; git provides enough metadata to identify the author of the code.\n\n## Development\n\n### Getting started\n\n1. Fork the repository and clone your fork:\n   ```bash\n   git clone https://github.com/<your-username>/Scrapling.git\n   cd Scrapling\n   git checkout dev\n   ```\n\n2. Create a virtual environment and install dependencies:\n   ```bash\n   python -m venv .venv\n   source .venv/bin/activate  # On Windows: .venv\\Scripts\\activate\n   pip install -e \".[all]\"\n   pip install -r tests/requirements.txt\n   ```\n\n3. Install browser dependencies:\n   ```bash\n   scrapling install\n   ```\n\n4. Set up pre-commit hooks:\n   ```bash\n   pip install pre-commit\n   pre-commit install\n   ```\n\n### Tips\n\nSetting the scrapling logging level to `debug` makes it easier to know what's happening in the background.\n```python\nimport logging\nlogging.getLogger(\"scrapling\").setLevel(logging.DEBUG)\n```\nBonus: You can install the beta of the upcoming update from the dev branch as follows\n```commandline\npip3 install git+https://github.com/D4Vinci/Scrapling.git@dev\n```\n\n## Tests\nScrapling includes a comprehensive test suite that can be executed with pytest. However, first, you need to install all libraries and `pytest-plugins` listed in `tests/requirements.txt`. 
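For example, you could install them from the repository root with:\n```bash\n# Install the test-only dependencies (pytest plugins, etc.)\npip install -r tests/requirements.txt\n```\n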
Then, running the tests will result in an output like this:\n   ```bash\n   $ pytest tests -n auto\n   =============================== test session starts ===============================\n   platform darwin -- Python 3.13.8, pytest-8.4.2, pluggy-1.6.0 -- /Users/<redacted>/.venv/bin/python3.13\n   cachedir: .pytest_cache\n   rootdir: /Users/<redacted>/scrapling\n   configfile: pytest.ini\n   plugins: asyncio-1.2.0, anyio-4.11.0, xdist-3.8.0, httpbin-2.1.0, cov-7.0.0\n   asyncio: mode=Mode.STRICT, asyncio_default_fixture_loop_scope=function, asyncio_default_test_loop_scope=function\n   10 workers [515 items]\n   scheduling tests via LoadScheduling\n\n   ...<shortened>...\n\n   =============================== 271 passed in 52.68s ==============================\n   ```\nHere, `-n auto` runs tests in parallel across multiple processes to increase speed.\n\n**Note:** You may need to run browser tests sequentially (`DynamicFetcher`/`StealthyFetcher`) to avoid conflicts. To run non-browser tests in parallel and browser tests separately:\n```bash\n# Non-browser tests (parallel)\npytest tests/ -k \"not (DynamicFetcher or StealthyFetcher)\" -n auto\n\n# Browser tests (sequential)\npytest tests/ -k \"DynamicFetcher or StealthyFetcher\"\n```\n\nBonus: You can also see the test coverage with the `pytest` plugin below\n```bash\npytest --cov=scrapling tests/\n```\n\n## Building Documentation\nDocumentation is built using [Zensical](https://zensical.org/). You can build it locally using the following commands:\n```bash\npip install zensical\npip install -r docs/requirements.txt\nzensical build --clean  # Build the static site\nzensical serve          # Local preview\n```\n"
  },
  {
    "path": "Dockerfile",
    "content": "FROM python:3.12-slim-trixie\n\nLABEL io.modelcontextprotocol.server.name=\"io.github.D4Vinci/Scrapling\"\nCOPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/\n\n# Set environment variables\nENV DEBIAN_FRONTEND=noninteractive \\\n    PYTHONUNBUFFERED=1 \\\n    PYTHONDONTWRITEBYTECODE=1\n\nWORKDIR /app\n\n# Copy dependency file first for better layer caching\nCOPY pyproject.toml ./\n\n# Install dependencies only\nRUN --mount=type=cache,target=/root/.cache/uv \\\n    uv sync --no-install-project --all-extras --compile-bytecode\n\n# Copy source code\nCOPY . .\n\n# Install browsers and project in one optimized layer\nRUN --mount=type=cache,target=/root/.cache/uv \\\n    --mount=type=cache,target=/var/cache/apt \\\n    --mount=type=cache,target=/var/lib/apt \\\n    apt-get update && \\\n    uv run playwright install-deps chromium && \\\n    uv run playwright install chromium && \\\n    uv sync --all-extras --compile-bytecode && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*\n\n# Expose port for MCP server HTTP transport\nEXPOSE 8000\n\n# Set entrypoint to run scrapling\nENTRYPOINT [\"uv\", \"run\", \"scrapling\"]\n\n# Default command (can be overridden)\nCMD [\"--help\"]"
  },
  {
    "path": "LICENSE",
    "content": "BSD 3-Clause License\n\nCopyright (c) 2024, Karim shoair\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n1. Redistributions of source code must retain the above copyright notice, this\n   list of conditions and the following disclaimer.\n\n2. Redistributions in binary form must reproduce the above copyright notice,\n   this list of conditions and the following disclaimer in the documentation\n   and/or other materials provided with the distribution.\n\n3. Neither the name of the copyright holder nor the names of its\n   contributors may be used to endorse or promote products derived from\n   this software without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "include LICENSE\ninclude *.db\ninclude *.js\ninclude scrapling/*.db\ninclude scrapling/*.db*\ninclude scrapling/*.db-*\ninclude scrapling/py.typed\ninclude scrapling/.scrapling_dependencies_installed\ninclude .scrapling_dependencies_installed\n\nrecursive-exclude * __pycache__\nrecursive-exclude * *.py[co]"
  },
  {
    "path": "README.md",
    "content": "<!-- mcp-name: io.github.D4Vinci/Scrapling -->\n\n<h1 align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io\">\n        <picture>\n          <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true\">\n          <img alt=\"Scrapling Poster\" src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true\">\n        </picture>\n    </a>\n    <br>\n    <small>Effortless Web Scraping for the Modern Web</small>\n</h1>\n\n<p align=\"center\">\n    <a href=\"https://trendshift.io/repositories/14244\" target=\"_blank\"><img src=\"https://trendshift.io/api/badge/repositories/14244\" alt=\"D4Vinci%2FScrapling | Trendshift\" style=\"width: 250px; height: 55px;\" width=\"250\" height=\"55\"/></a>\n    <br/>\n    <a href=\"https://github.com/D4Vinci/Scrapling/blob/main/docs/README_AR.md\">العربيه</a> | <a href=\"https://github.com/D4Vinci/Scrapling/blob/main/docs/README_ES.md\">Español</a> | <a href=\"https://github.com/D4Vinci/Scrapling/blob/main/docs/README_FR.md\">Français</a> | <a href=\"https://github.com/D4Vinci/Scrapling/blob/main/docs/README_DE.md\">Deutsch</a> | <a href=\"https://github.com/D4Vinci/Scrapling/blob/main/docs/README_CN.md\">简体中文</a> | <a href=\"https://github.com/D4Vinci/Scrapling/blob/main/docs/README_JP.md\">日本語</a> |  <a href=\"https://github.com/D4Vinci/Scrapling/blob/main/docs/README_RU.md\">Русский</a> | <a href=\"https://github.com/D4Vinci/Scrapling/blob/main/docs/README_KR.md\">한국어</a>\n    <br/>\n    <a href=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml\" alt=\"Tests\">\n        <img alt=\"Tests\" src=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg\"></a>\n    <a href=\"https://badge.fury.io/py/Scrapling\" alt=\"PyPI version\">\n        <img alt=\"PyPI version\" src=\"https://badge.fury.io/py/Scrapling.svg\"></a>\n    <a href=\"https://clickpy.clickhouse.com/dashboard/scrapling\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/dm/scrapling\" alt=\"PyPI package downloads\"></a>\n    <a href=\"https://github.com/D4Vinci/Scrapling/tree/main/agent-skill\" alt=\"AI Agent Skill directory\">\n        <img alt=\"Static Badge\" src=\"https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill\"></a>\n    <a href=\"https://clawhub.ai/D4Vinci/scrapling-official\" alt=\"OpenClaw Skill\">\n        <img alt=\"OpenClaw Skill\" src=\"https://img.shields.io/badge/Clawhub-darkred?style=flat&label=OpenClaw&link=https%3A%2F%2Fclawhub.ai%2FD4Vinci%2Fscrapling-official\"></a>\n    <br/>\n    <a href=\"https://discord.gg/EMgGbDceNQ\" alt=\"Discord\" target=\"_blank\">\n      <img alt=\"Discord\" src=\"https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ\">\n    </a>\n    <a href=\"https://x.com/Scrapling_dev\" alt=\"X (formerly Twitter)\">\n      <img alt=\"X (formerly Twitter) Follow\" src=\"https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev\">\n    </a>\n    <br/>\n    <a href=\"https://pypi.org/project/scrapling/\" alt=\"Supported Python versions\">\n        <img alt=\"Supported Python versions\" src=\"https://img.shields.io/pypi/pyversions/scrapling.svg\"></a>\n</p>\n\n<p align=\"center\">\n    <a 
href=\"https://scrapling.readthedocs.io/en/latest/parsing/selection.html\"><strong>Selection methods</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/fetching/choosing.html\"><strong>Fetchers</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/architecture.html\"><strong>Spiders</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html\"><strong>Proxy Rotation</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/cli/overview.html\"><strong>CLI</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html\"><strong>MCP</strong></a>\n</p>\n\nScrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.\n\nIts parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises.\n\nBlazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.\n\n```python\nfrom scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\nStealthyFetcher.adaptive = True\np = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)  # Fetch website under the radar!\nproducts = p.css('.product', auto_save=True)                                        # Scrape data that survives website design changes!\nproducts = p.css('.product', adaptive=True)                                         # Later, if the website structure changes, pass `adaptive=True` to find them!\n```\nOr scale up to full crawls\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass MySpider(Spider):\n  name = \"demo\"\n  start_urls = [\"https://example.com/\"]\n\n  async def parse(self, response: Response):\n      for item in response.css('.product'):\n          yield {\"title\": item.css('h2::text').get()}\n\nMySpider().start()\n```\n\n<p align=\"center\">\n    <a href=\"https://dataimpulse.com/?utm_source=scrapling&utm_medium=banner&utm_campaign=scrapling\" target=\"_blank\" style=\"display:flex; justify-content:center; padding:4px 0;\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/DataImpulse.png\" alt=\"At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.\" style=\"max-height:60px;\">\n    </a>\n</p>\n\n# Platinum Sponsors\n<table>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling\" target=\"_blank\" title=\"Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png\">\n      </a>\n    </td>\n    <td> Scrapling handles Cloudflare Turnstile. 
For enterprise-grade protection, <a href=\"https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling\">\n        <b>Hyper Solutions</b>\n      </a> provides API endpoints that generate valid antibot tokens for <b>Akamai</b>, <b>DataDome</b>, <b>Kasada</b>, and <b>Incapsula</b>. Simple API calls, no browser automation required. </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://birdproxies.com/t/scrapling\" target=\"_blank\" title=\"At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg\">\n      </a>\n    </td>\n    <td>Hey, we built <a href=\"https://birdproxies.com/t/scrapling\">\n        <b>BirdProxies</b>\n      </a> because proxies shouldn't be complicated or overpriced. Fast residential and ISP proxies in 195+ locations, fair pricing, and real support. <br />\n      <b>Try our FlappyBird game on the landing page for free data!</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\" target=\"_blank\" title=\"Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\">\n        <b>Evomi</b>\n      </a>: residential proxies from $0.49/GB. Scraping browser with fully spoofed Chromium, residential IPs, auto CAPTCHA solving, and anti-bot bypass. </br>\n      <b>Scraper API for hassle-free results. MCP and N8N integrations are available.</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\" title=\"Unlock the Power of Social Media Data & AI\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\">TikHub.io</a> provides 900+ stable APIs across 16+ platforms including TikTok, X, YouTube & Instagram, with 40M+ datasets. <br /> Also offers <a href=\"https://ai.tikhub.io/?ref=KarimShoair\" target=\"_blank\">DISCOUNTED AI models</a> — Claude, GPT, GEMINI & more up to 71% off.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\" title=\"Scalable Web Data Access for AI Applications\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\">Nsocks</a> provides fast Residential and ISP proxies for developers and scrapers. Global IP coverage, high anonymity, smart rotation, and reliable performance for automation and data extraction. Use <a href=\"https://www.xcrawl.com/?keyword=2p67aivg\" target=\"_blank\">Xcrawl</a> to simplify large-scale web crawling.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\" title=\"PetroSky delivers cutting-edge VPS hosting.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png\">\n      </a>\n    </td>\n    <td>\n    Close your laptop. 
Your scrapers keep running. <br />\n    <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\">PetroSky VPS</a> - cloud servers built for nonstop automation. Windows and Linux machines with full control. From €6.99/mo.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\" title=\"The #1 newsletter dedicated to Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png\">\n      </a>\n    </td>\n    <td>\n    Read a full review of <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\">Scrapling on The Web Scraping Club</a> (Nov 2025), the #1 newsletter dedicated to Web Scraping.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\" title=\"Proxy-Seller provides reliable proxy infrastructure for Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\">Proxy-Seller</a> provides reliable proxy infrastructure for web scraping, offering IPv4, IPv6, ISP, Residential, and Mobile proxies with stable performance, broad geo coverage, and flexible plans for business-scale data collection.\n    </td>\n  </tr>\n</table>\n\n<i><sub>Do you want to show your ad here? Click [here](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>\n# Sponsors \n\n<!-- sponsors -->\n\n\n<a href=\"https://serpapi.com/?utm_source=scrapling\" target=\"_blank\" title=\"Scrape Google and other search engines with SerpApi\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png\"></a>\n<a href=\"https://visit.decodo.com/Dy6W0b\" target=\"_blank\" title=\"Try the Most Efficient Residential Proxies for Free\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png\"></a>\n<a href=\"https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci\" target=\"_blank\" title=\"The web scraping service that actually beats anti-bot systems!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png\"></a>\n<a href=\"https://proxyempire.io/?ref=scrapling&utm_source=scrapling\" target=\"_blank\" title=\"Collect The Data Your Project Needs with the Best Residential Proxies\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png\"></a><a href=\"https://www.swiftproxy.net/\" target=\"_blank\" title=\"Unlock Reliable Proxy Services with Swiftproxy!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png\"></a>\n<a href=\"https://www.rapidproxy.io/?ref=d4v\" target=\"_blank\" title=\"Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs.\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg\"></a>\n<a href=\"https://browser.cash/?utm_source=D4Vinci&utm_medium=referral\" target=\"_blank\" title=\"Browser Automation & AI Browser Agent Platform\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png\"></a>\n\n<!-- /sponsors -->\n\n<i><sub>Do you want to show your ad here? 
Click [here](https://github.com/sponsors/D4Vinci) and choose the tier that suits you!</sub></i>\n\n---\n\n## Key Features\n\n### Spiders — A Full Crawling Framework\n- 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.\n- ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays.\n- 🔄 **Multi-Session Support**: Unified interface for HTTP requests and stealthy headless browsers in a single spider — route requests to different sessions by ID.\n- 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off.\n- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UIs, pipelines, and long-running crawls.\n- 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic.\n- 📦 **Built-in Export**: Export results through hooks and your own pipeline, or use the built-in JSON/JSONL export with `result.items.to_json()` / `result.items.to_jsonl()` respectively.\n\n### Advanced Website Fetching with Session Support\n- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprints and headers, and can use HTTP/3.\n- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class, supporting Playwright's Chromium and Google's Chrome.\n- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. Can easily bypass all types of Cloudflare's Turnstile/Interstitial challenges with automation.\n- **Session Management**: Persistent session support with the `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.\n- **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides.\n- **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers.\n- **Async Support**: Complete async support across all fetchers and dedicated async session classes.\n\n### Adaptive Scraping & AI Integration\n- 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.\n- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.\n- 🔍 **Find Similar Elements**: Automatically locate elements similar to the ones you've already found.\n- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc.), thereby speeding up operations and reducing costs by minimizing token usage. 
([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))\n\n### High-Performance & Battle-Tested Architecture\n- 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries.\n- 🔋 **Memory Efficient**: Optimized data structures and lazy loading for a minimal memory footprint.\n- ⚡ **Fast JSON Serialization**: 10x faster than the standard library.\n- 🏗️ **Battle-tested**: Not only does Scrapling have 92% test coverage and full type-hint coverage, but it has also been used daily by hundreds of Web Scrapers over the past year.\n\n### Developer/Web Scraper Friendly Experience\n- 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping script development, like converting curl requests to Scrapling requests and viewing request results in your browser.\n- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!\n- 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.\n- 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.\n- 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.\n- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.\n- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. The entire codebase is automatically scanned with **PyRight** and **MyPy** with each change.\n- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.\n\n## Getting Started\n\nLet's give you a quick glimpse of what Scrapling can do without deep diving.\n\n### Basic Usage\nHTTP requests with session support\n```python\nfrom scrapling.fetchers import Fetcher, FetcherSession\n\nwith FetcherSession(impersonate='chrome') as session:  # Use latest version of Chrome's TLS fingerprint\n    page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)\n    quotes = page.css('.quote .text::text').getall()\n\n# Or use one-off requests\npage = Fetcher.get('https://quotes.toscrape.com/')\nquotes = page.css('.quote .text::text').getall()\n```\nAdvanced stealth mode\n```python\nfrom scrapling.fetchers import StealthyFetcher, StealthySession\n\nwith StealthySession(headless=True, solve_cloudflare=True) as session:  # Keep the browser open until you finish\n    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)\n    data = page.css('#padded_content a').getall()\n\n# Or use one-off request style, it opens the browser for this request, then closes it after finishing\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')\ndata = page.css('#padded_content a').getall()\n```\nFull browser automation\n```python\nfrom scrapling.fetchers import DynamicFetcher, DynamicSession\n\nwith DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:  # Keep the browser open until you finish\n    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)\n    data = page.xpath('//span[@class=\"text\"]/text()').getall()  # XPath selector if you prefer it\n\n# Or use one-off request style, it opens the browser for this request, then closes it after finishing\npage = DynamicFetcher.fetch('https://quotes.toscrape.com/')\ndata = page.css('.quote .text::text').getall()\n```\n
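Browser-based fetchers can also block requests to specific domains and their subdomains, as listed in the features above. A minimal sketch (the blocked domain below is only an illustrative example):\n```python\nfrom scrapling.fetchers import DynamicFetcher\n\n# Block a third-party domain (and all of its subdomains) during this fetch\npage = DynamicFetcher.fetch(\n    'https://quotes.toscrape.com/',\n    blocked_domains={'googletagmanager.com'},\n)\nprint(page.status)\n```\n\n### Spiders\nBuild 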
full crawlers with concurrent requests, multiple session types, and pause/resume:\n```python\nfrom scrapling.spiders import Spider, Request, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 10\n    \n    async def parse(self, response: Response):\n        for quote in response.css('.quote'):\n            yield {\n                \"text\": quote.css('.text::text').get(),\n                \"author\": quote.css('.author::text').get(),\n            }\n            \n        next_page = response.css('.next a')\n        if next_page:\n            yield response.follow(next_page[0].attrib['href'])\n\nresult = QuotesSpider().start()\nprint(f\"Scraped {len(result.items)} quotes\")\nresult.items.to_json(\"quotes.json\")\n```\nUse multiple session types in a single spider:\n```python\nfrom scrapling.spiders import Spider, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass MultiSessionSpider(Spider):\n    name = \"multi\"\n    start_urls = [\"https://example.com/\"]\n    \n    def configure_sessions(self, manager):\n        manager.add(\"fast\", FetcherSession(impersonate=\"chrome\"))\n        manager.add(\"stealth\", AsyncStealthySession(headless=True), lazy=True)\n    \n    async def parse(self, response: Response):\n        for link in response.css('a::attr(href)').getall():\n            # Route protected pages through the stealth session\n            if \"protected\" in link:\n                yield Request(link, sid=\"stealth\")\n            else:\n                yield Request(link, sid=\"fast\", callback=self.parse)  # explicit callback\n```\nPause and resume long crawls with checkpoints by running the spider like this:\n```python\nQuotesSpider(crawldir=\"./crawl_data\").start()\n```\nPress Ctrl+C to pause gracefully — progress is saved automatically. 
Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped.\n\n### Advanced Parsing & Navigation\n```python\nfrom scrapling.fetchers import Fetcher\n\n# Rich element selection and navigation\npage = Fetcher.get('https://quotes.toscrape.com/')\n\n# Get quotes with multiple selection methods\nquotes = page.css('.quote')  # CSS selector\nquotes = page.xpath('//div[@class=\"quote\"]')  # XPath\nquotes = page.find_all('div', {'class': 'quote'})  # BeautifulSoup-style\n# Same as\nquotes = page.find_all('div', class_='quote')\nquotes = page.find_all(['div'], class_='quote')\nquotes = page.find_all(class_='quote')  # and so on...\n# Find element by text content\nquotes = page.find_by_text('quote', tag='div')\n\n# Advanced navigation\nquote_text = page.css('.quote')[0].css('.text::text').get()\nquote_text = page.css('.quote').css('.text::text').getall()  # Chained selectors\nfirst_quote = page.css('.quote')[0]\nauthor = first_quote.next_sibling.css('.author::text')\nparent_container = first_quote.parent\n\n# Element relationships and similarity\nsimilar_elements = first_quote.find_similar()\nbelow_elements = first_quote.below_elements()\n```\nYou can use the parser right away if you don't want to fetch websites like below:\n```python\nfrom scrapling.parser import Selector\n\npage = Selector(\"<html>...</html>\")\n```\nAnd it works precisely the same way!\n\n### Async Session Management Examples\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession\n\nasync with FetcherSession(http3=True) as session:  # `FetcherSession` is context-aware and can work in both sync/async patterns\n    page1 = session.get('https://quotes.toscrape.com/')\n    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')\n\n# Async session usage\nasync with AsyncStealthySession(max_pages=2) as session:\n    tasks = []\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n    \n    for url in urls:\n        task = session.fetch(url)\n        tasks.append(task)\n    \n    print(session.get_pool_stats())  # Optional - The status of the browser tabs pool (busy/free/error)\n    results = await asyncio.gather(*tasks)\n    print(session.get_pool_stats())\n```\n\n## CLI & Interactive Shell\n\nScrapling includes a powerful command-line interface:\n\n[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)\n\nLaunch the interactive Web Scraping shell\n```bash\nscrapling shell\n```\nExtract pages to a file directly without programming (Extracts the content inside the `body` tag by default). If the output file ends with `.txt`, then the text content of the target will be extracted. If it ends in `.md`, it will be a Markdown representation of the HTML content; if it ends in `.html`, it will be the HTML content itself.\n```bash\nscrapling extract get 'https://example.com' content.md\nscrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome'  # All elements matching the CSS selector '#fromSkipToProducts'\nscrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless\nscrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare\n```\n\n> [!NOTE]\n> There are many additional features, but we want to keep this page concise, including the MCP server and the interactive Web Scraping Shell. 
Check out the full documentation [here](https://scrapling.readthedocs.io/en/latest/)\n\n## Performance Benchmarks\n\nScrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries.\n\n### Text Extraction Speed Test (5000 nested elements)\n\n| # |      Library      | Time (ms) | vs Scrapling |\n|---|:-----------------:|:---------:|:------------:|\n| 1 |     Scrapling     |   2.02    |     1.0x     |\n| 2 |   Parsel/Scrapy   |   2.04    |    1.01x     |\n| 3 |     Raw Lxml      |   2.54    |    1.257x    |\n| 4 |      PyQuery      |   24.17   |     ~12x     |\n| 5 |    Selectolax     |   82.63   |     ~41x     |\n| 6 |  MechanicalSoup   |  1549.71  |   ~767.1x    |\n| 7 |   BS4 with Lxml   |  1584.31  |   ~784.3x    |\n| 8 | BS4 with html5lib |  3391.91  |   ~1679.1x   |\n\n\n### Element Similarity & Text Search Performance\n\nScrapling's adaptive element-finding capabilities significantly outperform alternatives:\n\n| Library     | Time (ms) | vs Scrapling |\n|-------------|:---------:|:------------:|\n| Scrapling   |   2.39    |     1.0x     |\n| AutoScraper |   12.45   |    5.209x    |\n\n\n> All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.\n\n## Installation\n\nScrapling requires Python 3.10 or higher:\n\n```bash\npip install scrapling\n```\n\nThis installation only includes the parser engine and its dependencies, without any fetcher or command-line dependencies.\n\n### Optional Dependencies\n\n1. If you are going to use any of the extra features below, the fetchers, or their classes, you will need to install the fetchers' dependencies and their browser dependencies as follows:\n    ```bash\n    pip install \"scrapling[fetchers]\"\n    \n    scrapling install           # normal install\n    scrapling install  --force  # force reinstall\n    ```\n\n    This downloads all browsers, along with their system dependencies and fingerprint manipulation dependencies.\n\n    Or you can install them from your code instead of running a command, like this:\n    ```python\n    from scrapling.cli import install\n    \n    install([], standalone_mode=False)          # normal install\n    install([\"--force\"], standalone_mode=False) # force reinstall\n    ```\n\n2. Extra features:\n   - Install the MCP server feature:\n       ```bash\n       pip install \"scrapling[ai]\"\n       ```\n   - Install shell features (Web Scraping shell and the `extract` command): \n       ```bash\n       pip install \"scrapling[shell]\"\n       ```\n   - Install everything: \n       ```bash\n       pip install \"scrapling[all]\"\n       ```\n   Remember that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already).\n\n### Docker\nYou can also pull a Docker image with all extras and browsers from DockerHub with the following command:\n```bash\ndocker pull pyd4vinci/scrapling\n```\nOr pull it from the GitHub registry:\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\nThis image is automatically built and pushed using GitHub Actions from the repository's main branch.\n\n## Contributing\n\nWe welcome contributions! Please read our [contributing guidelines](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) before getting started.\n\n## Disclaimer\n\n> [!CAUTION]\n> This library is provided for educational and research purposes only. 
By using this library, you agree to comply with local and international data scraping and privacy laws. The authors and contributors are not responsible for any misuse of this software. Always respect the terms of service of websites and robots.txt files.\n\n## 🎓 Citations\nIf you have used our library for research purposes, please cite us with the following reference:\n```bibtex\n  @misc{scrapling,\n    author = {Karim Shoair},\n    title = {Scrapling},\n    year = {2024},\n    url = {https://github.com/D4Vinci/Scrapling},\n    note = {An adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl!}\n  }\n```\n\n## License\n\nThis work is licensed under the BSD-3-Clause License.\n\n## Acknowledgments\n\nThis project includes code adapted from:\n- Parsel (BSD License), used for the [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) submodule\n\n---\n<div align=\"center\"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>\n"
  },
  {
    "path": "ROADMAP.md",
    "content": "## TODOs\n- [x] Add more tests and increase the code coverage.\n- [x] Structure the tests folder in a better way.\n- [x] Add more documentation.\n- [x] Add the browsing ability.\n- [x] Create detailed documentation for the 'readthedocs' website, preferably add GitHub action for deploying it.\n- [ ] Create a Scrapy plugin/decorator to make it replace parsel in the response argument when needed.\n- [x] Need to add more functionality to `AttributesHandler` and more navigation functions to `Selector` object (ex: functions similar to map, filter, and reduce functions but here pass it to the element and the function is executed on children, siblings, next elements, etc...)\n- [x] Add `.filter` method to `Selectors` object and other similar methods.\n- [ ] Add functionality to automatically detect pagination URLs\n- [ ] Add the ability to auto-detect schemas in pages and manipulate them.\n- [ ] Add `analyzer` ability that tries to learn about the page through meta-elements and return what it learned\n- [ ] Add the ability to generate a regex from a group of elements (Like for all href attributes)\n- "
  },
  {
    "path": "agent-skill/README.md",
    "content": "# Scrapling Agent Skill\n\nThe skill aligns with the [AgentSkill](https://agentskills.io/specification) specification, so it will be readable by [OpenClaw](https://github.com/openclaw/openclaw), [Claude Code](https://claude.com/product/claude-code), and other agentic tools. It encapsulates almost all of the documentation website's content in Markdown, so the agent doesn't have to guess anything.\n\nIt can be used to answer almost 90% of any questions you would have about scrapling. We tested it on [OpenClaw](https://github.com/openclaw/openclaw) and [Claude Code](https://claude.com/product/claude-code), but please open a [ticket](https://github.com/D4Vinci/Scrapling/issues/new/choose) if you faced any issues or use our [Discord server](https://discord.gg/EMgGbDceNQ).\n\n## Installation\n\nYou can use this [direct URL](https://github.com/D4Vinci/Scrapling/raw/refs/heads/main/agent-skill/Scrapling-Skill.zip) to download the ZIP file of the skill directly. We will try to update this page with all available methods.\n\n### Clawhub\nIf you are an [OpenClaw](https://github.com/openclaw/openclaw) and [Claude Code](https://claude.com/product/claude-code), you can install the skill using [Clawhub](https://docs.openclaw.ai/tools/clawhub) directly:\n```bash\nclawhub install scrapling-official\n```\n\nOr go to the [Clawhub](https://docs.openclaw.ai/tools/clawhub) page from [here](https://clawhub.ai/D4Vinci/scrapling-official)."
  },
  {
    "path": "agent-skill/Scrapling-Skill/LICENSE.txt",
    "content": "BSD 3-Clause License\n\nCopyright (c) 2024, Karim shoair\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are met:\n\n1. Redistributions of source code must retain the above copyright notice, this\n   list of conditions and the following disclaimer.\n\n2. Redistributions in binary form must reproduce the above copyright notice,\n   this list of conditions and the following disclaimer in the documentation\n   and/or other materials provided with the distribution.\n\n3. Neither the name of the copyright holder nor the names of its\n   contributors may be used to endorse or promote products derived from\n   this software without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\nAND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\nDISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\nFOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\nDAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\nSERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\nOR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
  },
  {
    "path": "agent-skill/Scrapling-Skill/SKILL.md",
    "content": "---\nname: scrapling-official\ndescription: Scrape web pages using Scrapling with anti-bot bypass (like Cloudflare Turnstile), stealth headless browsing, spiders framework, adaptive scraping, and JavaScript rendering. Use when asked to scrape, crawl, or extract data from websites; web_fetch fails; the site has anti-bot protections; write Python code to scrape/crawl; or write spiders.\nversion: 0.4.2\nlicense: Complete terms in LICENSE.txt\n---\n\n# Scrapling\n\nScrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.\n\nIts parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises.\n\nBlazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.\n\n**Requires: Python 3.10+**\n\n**This is the official skill for the scrapling library by the library author.**\n\n\n## Setup (once)\n\nCreate a virtual Python environment through any way available, like `venv`, then inside the environment do:\n\n`pip install \"scrapling[all]>=0.4.2\"`\n\nThen do this to download all the browsers' dependencies:\n\n```bash\nscrapling install --force\n```\n\nMake note of the `scrapling` binary path and use it instead of `scrapling` from now on with all commands (if `scrapling` is not on `$PATH`).\n\n### Docker\nAnother option if the user doesn't have Python or doesn't want to use it is to use the Docker image, but this can be used only in the commands, so no writing Python code for scrapling this way:\n\n```bash\ndocker pull pyd4vinci/scrapling\n```\nor\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\n\n## CLI Usage\n\nThe `scrapling extract` command group lets you download and extract content from websites directly without writing any code.\n\n```bash\nUsage: scrapling extract [OPTIONS] COMMAND [ARGS]...\n\nCommands:\n  get             Perform a GET request and save the content to a file.\n  post            Perform a POST request and save the content to a file.\n  put             Perform a PUT request and save the content to a file.\n  delete          Perform a DELETE request and save the content to a file.\n  fetch           Use a browser to fetch content with browser automation and flexible options.\n  stealthy-fetch  Use a stealthy browser to fetch content with advanced stealth features.\n```\n\n### Usage pattern\n- Choose your output format by changing the file extension. 
Here are some examples for the `scrapling extract get` command:\n  - Convert the HTML content to Markdown, then save it to the file (great for documentation): `scrapling extract get \"https://blog.example.com\" article.md`\n  - Save the HTML content as it is to the file: `scrapling extract get \"https://example.com\" page.html`\n  - Save a clean version of the text content of the webpage to the file: `scrapling extract get \"https://example.com\" content.txt`\n- Output to a temp file, read it back, then clean up.\n- All commands can use CSS selectors to extract specific parts of the page through `--css-selector` or `-s`.\n\nWhich command to use generally:\n- Use **`get`** with simple websites, blogs, or news articles.\n- Use **`fetch`** with modern web apps, or sites with dynamic content.\n- Use **`stealthy-fetch`** with protected sites, Cloudflare, or anti-bot systems.\n\n> When unsure, start with `get`. If it fails or returns empty content, escalate to `fetch`, then `stealthy-fetch`. The speed of `fetch` and `stealthy-fetch` is nearly the same, so you are not sacrificing anything.\n\n#### Key options (requests)\n\nThose options are shared between the 4 HTTP request commands:\n\n| Option                                     | Input type | Description                                                                                                                                    |\n|:-------------------------------------------|:----------:|:-----------------------------------------------------------------------------------------------------------------------------------------------|\n| -H, --headers                              |    TEXT    | HTTP headers in format \"Key: Value\" (can be used multiple times)                                                                               |\n| --cookies                                  |    TEXT    | Cookies string in format \"name1=value1; name2=value2\"                                                                                          |\n| --timeout                                  |  INTEGER   | Request timeout in seconds (default: 30)                                                                                                       |\n| --proxy                                    |    TEXT    | Proxy URL in format \"http://username:password@host:port\"                                                                                       |\n| -s, --css-selector                         |    TEXT    | CSS selector to extract specific content from the page. It returns all matches.                                                                |\n| -p, --params                               |    TEXT    | Query parameters in format \"key=value\" (can be used multiple times)                                                                            |\n| --follow-redirects / --no-follow-redirects |    None    | Whether to follow redirects (default: True)                                                                                                    |\n| --verify / --no-verify                     |    None    | Whether to verify SSL certificates (default: True)                                                                                             |\n| --impersonate                              |    TEXT    | Browser to impersonate. Can be a single browser (e.g., Chrome) or a comma-separated list for random selection (e.g., Chrome, Firefox, Safari). 
|\n| --stealthy-headers / --no-stealthy-headers |    None    | Use stealthy browser headers (default: True)                                                                                                   |\n\nOptions shared between `post` and `put` only:\n\n| Option     | Input type | Description                                                                             |\n|:-----------|:----------:|:----------------------------------------------------------------------------------------|\n| -d, --data |    TEXT    | Form data to include in the request body (as string, ex: \"param1=value1&param2=value2\") |\n| -j, --json |    TEXT    | JSON data to include in the request body (as string)                                    |\n\nExamples:\n\n```bash\n# Basic download\nscrapling extract get \"https://news.site.com\" news.md\n\n# Download with custom timeout\nscrapling extract get \"https://example.com\" content.txt --timeout 60\n\n# Extract only specific content using CSS selectors\nscrapling extract get \"https://blog.example.com\" articles.md --css-selector \"article\"\n\n# Send a request with cookies\nscrapling extract get \"https://scrapling.requestcatcher.com\" content.md --cookies \"session=abc123; user=john\"\n\n# Add user agent\nscrapling extract get \"https://api.site.com\" data.json -H \"User-Agent: MyBot 1.0\"\n\n# Add multiple headers\nscrapling extract get \"https://site.com\" page.html -H \"Accept: text/html\" -H \"Accept-Language: en-US\"\n```\n\n#### Key options (browsers)\n\nBoth (`fetch` / `stealthy-fetch`) share options:\n\n\n| Option                                   | Input type | Description                                                                                                                                              |\n|:-----------------------------------------|:----------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------|\n| --headless / --no-headless               |    None    | Run browser in headless mode (default: True)                                                                                                             |\n| --disable-resources / --enable-resources |    None    | Drop unnecessary resources for speed boost (default: False)                                                                                              |\n| --network-idle / --no-network-idle       |    None    | Wait for network idle (default: False)                                                                                                                   |\n| --real-chrome / --no-real-chrome         |    None    | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False) |\n| --timeout                                |  INTEGER   | Timeout in milliseconds (default: 30000)                                                                                                                 |\n| --wait                                   |  INTEGER   | Additional wait time in milliseconds after page load (default: 0)                                                                                        |\n| -s, --css-selector                       |    TEXT    | CSS selector to extract specific content from the page. It returns all matches.                                                                          
|\n| --wait-selector                          |    TEXT    | CSS selector to wait for before proceeding                                                                                                               |\n| --proxy                                  |    TEXT    | Proxy URL in format \"http://username:password@host:port\"                                                                                                 |\n| -H, --extra-headers                      |    TEXT    | Extra headers in format \"Key: Value\" (can be used multiple times)                                                                                        |\n\nThis option is specific to `fetch` only:\n\n| Option   | Input type | Description                                                 |\n|:---------|:----------:|:------------------------------------------------------------|\n| --locale |    TEXT    | Specify user locale. Defaults to the system default locale. |\n\nAnd these options are specific to `stealthy-fetch` only:\n\n| Option                                     | Input type | Description                                     |\n|:-------------------------------------------|:----------:|:------------------------------------------------|\n| --block-webrtc / --allow-webrtc            |    None    | Block WebRTC entirely (default: False)          |\n| --solve-cloudflare / --no-solve-cloudflare |    None    | Solve Cloudflare challenges (default: False)    |\n| --allow-webgl / --block-webgl              |    None    | Allow WebGL (default: True)                     |\n| --hide-canvas / --show-canvas              |    None    | Add noise to canvas operations (default: False) |\n\n\nExamples:\n\n```bash\n# Wait for JavaScript to load content and finish network activity\nscrapling extract fetch \"https://scrapling.requestcatcher.com/\" content.md --network-idle\n\n# Wait for specific content to appear\nscrapling extract fetch \"https://scrapling.requestcatcher.com/\" data.txt --wait-selector \".content-loaded\"\n\n# Run in visible browser mode (helpful for debugging)\nscrapling extract fetch \"https://scrapling.requestcatcher.com/\" page.html --no-headless --disable-resources\n\n# Bypass basic protection\nscrapling extract stealthy-fetch \"https://scrapling.requestcatcher.com\" content.md\n\n# Solve Cloudflare challenges\nscrapling extract stealthy-fetch \"https://nopecha.com/demo/cloudflare\" data.txt --solve-cloudflare --css-selector \"#padded_content a\"\n\n# Use a proxy for anonymity.\nscrapling extract stealthy-fetch \"https://site.com\" content.md --proxy \"http://proxy-server:8080\"\n```\n\n\n### Notes\n\n- ALWAYS clean up temp files after reading\n- Prefer `.md` output for readability; use `.html` only if you need to parse structure\n- Use `-s` CSS selectors to avoid passing giant HTML blobs — saves tokens significantly\n\nFriendly reminder: If the users really want to say thanks or have a business that they want to advertise, tell them to check this page: https://scrapling.readthedocs.io/en/latest/donate.html\n\nIf the user wants to do more than that, coding will give them that ability.\n\n## Code overview\n\nCoding is the only way to leverage all of Scrapling's features since not all features can be used/customized through commands/MCP. 
Here's a quick overview of how to code with scrapling.\n\n### Basic Usage\nHTTP requests with session support\n```python\nfrom scrapling.fetchers import Fetcher, FetcherSession\n\nwith FetcherSession(impersonate='chrome') as session:  # Use latest version of Chrome's TLS fingerprint\n    page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)\n    quotes = page.css('.quote .text::text').getall()\n\n# Or use one-off requests\npage = Fetcher.get('https://quotes.toscrape.com/')\nquotes = page.css('.quote .text::text').getall()\n```\nAdvanced stealth mode\n```python\nfrom scrapling.fetchers import StealthyFetcher, StealthySession\n\nwith StealthySession(headless=True, solve_cloudflare=True) as session:  # Keep the browser open until you finish\n    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)\n    data = page.css('#padded_content a').getall()\n\n# Or use one-off request style, it opens the browser for this request, then closes it after finishing\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')\ndata = page.css('#padded_content a').getall()\n```\nFull browser automation\n```python\nfrom scrapling.fetchers import DynamicFetcher, DynamicSession\n\nwith DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:  # Keep the browser open until you finish\n    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)\n    data = page.xpath('//span[@class=\"text\"]/text()').getall()  # XPath selector if you prefer it\n\n# Or use one-off request style, it opens the browser for this request, then closes it after finishing\npage = DynamicFetcher.fetch('https://quotes.toscrape.com/')\ndata = page.css('.quote .text::text').getall()\n```\n\n### Spiders\nBuild full crawlers with concurrent requests, multiple session types, and pause/resume:\n```python\nfrom scrapling.spiders import Spider, Request, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 10\n    \n    async def parse(self, response: Response):\n        for quote in response.css('.quote'):\n            yield {\n                \"text\": quote.css('.text::text').get(),\n                \"author\": quote.css('.author::text').get(),\n            }\n            \n        next_page = response.css('.next a')\n        if next_page:\n            yield response.follow(next_page[0].attrib['href'])\n\nresult = QuotesSpider().start()\nprint(f\"Scraped {len(result.items)} quotes\")\nresult.items.to_json(\"quotes.json\")\n```\nUse multiple session types in a single spider:\n```python\nfrom scrapling.spiders import Spider, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass MultiSessionSpider(Spider):\n    name = \"multi\"\n    start_urls = [\"https://example.com/\"]\n    \n    def configure_sessions(self, manager):\n        manager.add(\"fast\", FetcherSession(impersonate=\"chrome\"))\n        manager.add(\"stealth\", AsyncStealthySession(headless=True), lazy=True)\n    \n    async def parse(self, response: Response):\n        for link in response.css('a::attr(href)').getall():\n            # Route protected pages through the stealth session\n            if \"protected\" in link:\n                yield Request(link, sid=\"stealth\")\n            else:\n                yield Request(link, sid=\"fast\", callback=self.parse)  # explicit callback\n```\nPause and resume long crawls with checkpoints by running the spider 
like this:\n```python\nQuotesSpider(crawldir=\"./crawl_data\").start()\n```\nPress Ctrl+C to pause gracefully — progress is saved automatically. Later, when you start the spider again, pass the same `crawldir`, and it will resume from where it stopped.\n\n### Advanced Parsing & Navigation\n```python\nfrom scrapling.fetchers import Fetcher\n\n# Rich element selection and navigation\npage = Fetcher.get('https://quotes.toscrape.com/')\n\n# Get quotes with multiple selection methods\nquotes = page.css('.quote')  # CSS selector\nquotes = page.xpath('//div[@class=\"quote\"]')  # XPath\nquotes = page.find_all('div', {'class': 'quote'})  # BeautifulSoup-style\n# Same as\nquotes = page.find_all('div', class_='quote')\nquotes = page.find_all(['div'], class_='quote')\nquotes = page.find_all(class_='quote')  # and so on...\n# Find element by text content\nquotes = page.find_by_text('quote', tag='div')\n\n# Advanced navigation\nquote_text = page.css('.quote')[0].css('.text::text').get()\nquote_text = page.css('.quote').css('.text::text').getall()  # Chained selectors\nfirst_quote = page.css('.quote')[0]\nauthor = first_quote.next_sibling.css('.author::text')\nparent_container = first_quote.parent\n\n# Element relationships and similarity\nsimilar_elements = first_quote.find_similar()\nbelow_elements = first_quote.below_elements()\n```\nYou can use the parser right away if you don't want to fetch websites like below:\n```python\nfrom scrapling.parser import Selector\n\npage = Selector(\"<html>...</html>\")\n```\nAnd it works precisely the same way!\n### Async Session Management Examples\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession\n\nasync with FetcherSession(http3=True) as session:  # `FetcherSession` is context-aware and can work in both sync/async patterns\n    page1 = session.get('https://quotes.toscrape.com/')\n    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')\n\n# Async session usage\nasync with AsyncStealthySession(max_pages=2) as session:\n    tasks = []\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n    \n    for url in urls:\n        task = session.fetch(url)\n        tasks.append(task)\n    \n    print(session.get_pool_stats())  # Optional - The status of the browser tabs pool (busy/free/error)\n    results = await asyncio.gather(*tasks)\n    print(session.get_pool_stats())\n```\n\n## References\nYou already had a good glimpse of what the library can do. Use the references below to dig deeper when needed\n- `references/mcp-server.md` — MCP server tools and capabilities\n- `references/parsing` — Everything you need for parsing HTML\n- `references/fetching` — Everything you need to fetch websites and session persistence\n- `references/spiders` — Everything you need to write spiders, proxy rotation, and advanced features. 
It follows a Scrapy-like format\n- `references/migrating_from_beautifulsoup.md` — A quick API comparison between Scrapling and BeautifulSoup\n- `https://github.com/D4Vinci/Scrapling/tree/main/docs` — Full official docs in Markdown for quick access (use only if the current references do not look up-to-date).\n\nThis skill encapsulates almost all the published documentation in Markdown, so don't check external sources or search online without the user's permission.\n\n## Guardrails (Always)\n- Only scrape content you're authorized to access.\n- Respect robots.txt and ToS.\n- Add delays (download_delay) for large crawls.\n- Don't bypass paywalls or authentication without permission.\n- Never scrape personal/sensitive data."
  },
  {
    "path": "agent-skill/Scrapling-Skill/examples/01_fetcher_session.py",
    "content": "\"\"\"\nExample 1: Python - FetcherSession (persistent HTTP session with Chrome TLS fingerprint)\n\nScrapes all 10 pages of quotes.toscrape.com using a single HTTP session.\nNo browser launched — fast and lightweight.\n\nBest for: static or semi-static sites, APIs, pages that don't require JavaScript.\n\"\"\"\n\nfrom scrapling.fetchers import FetcherSession\n\nall_quotes = []\n\nwith FetcherSession(impersonate=\"chrome\") as session:\n    for i in range(1, 11):\n        page = session.get(\n            f\"https://quotes.toscrape.com/page/{i}/\",\n            stealthy_headers=True,\n        )\n        quotes = page.css(\".quote .text::text\").getall()\n        all_quotes.extend(quotes)\n        print(f\"Page {i}: {len(quotes)} quotes (status {page.status})\")\n\nprint(f\"\\nTotal: {len(all_quotes)} quotes\\n\")\nfor i, quote in enumerate(all_quotes, 1):\n    print(f\"{i:>3}. {quote}\")\n"
  },
  {
    "path": "agent-skill/Scrapling-Skill/examples/02_dynamic_session.py",
    "content": "\"\"\"\nExample 2: Python - DynamicSession (Playwright browser automation, visible)\n\nScrapes all 10 pages of quotes.toscrape.com using a persistent browser session.\nThe browser window stays open across all page requests for efficiency.\n\nBest for: JavaScript-heavy pages, SPAs, sites with dynamic content loading.\n\nSet headless=True to run the browser hidden.\nSet disable_resources=True to skip loading images/fonts for a speed boost.\n\"\"\"\n\nfrom scrapling.fetchers import DynamicSession\n\nall_quotes = []\n\nwith DynamicSession(headless=False, disable_resources=True) as session:\n    for i in range(1, 11):\n        page = session.fetch(f\"https://quotes.toscrape.com/page/{i}/\")\n        quotes = page.css(\".quote .text::text\").getall()\n        all_quotes.extend(quotes)\n        print(f\"Page {i}: {len(quotes)} quotes (status {page.status})\")\n\nprint(f\"\\nTotal: {len(all_quotes)} quotes\\n\")\nfor i, quote in enumerate(all_quotes, 1):\n    print(f\"{i:>3}. {quote}\")\n"
  },
  {
    "path": "agent-skill/Scrapling-Skill/examples/03_stealthy_session.py",
    "content": "\"\"\"\nExample 3: Python - StealthySession (Patchright stealth browser, visible)\n\nScrapes all 10 pages of quotes.toscrape.com using a persistent stealth browser session.\nBypasses anti-bot protections automatically (Cloudflare Turnstile, fingerprinting, etc.).\n\nBest for: well-protected sites, Cloudflare-gated pages, sites that detect Playwright.\n\nSet headless=True to run the browser hidden.\nAdd solve_cloudflare=True to auto-solve Cloudflare challenges.\n\"\"\"\n\nfrom scrapling.fetchers import StealthySession\n\nall_quotes = []\n\nwith StealthySession(headless=False) as session:\n    for i in range(1, 11):\n        page = session.fetch(f\"https://quotes.toscrape.com/page/{i}/\")\n        quotes = page.css(\".quote .text::text\").getall()\n        all_quotes.extend(quotes)\n        print(f\"Page {i}: {len(quotes)} quotes (status {page.status})\")\n\nprint(f\"\\nTotal: {len(all_quotes)} quotes\\n\")\nfor i, quote in enumerate(all_quotes, 1):\n    print(f\"{i:>3}. {quote}\")\n"
  },
  {
    "path": "agent-skill/Scrapling-Skill/examples/04_spider.py",
    "content": "\"\"\"\nExample 4: Python - Spider (auto-crawling framework)\n\nScrapes ALL pages of quotes.toscrape.com by following \"Next\" pagination links\nautomatically. No manual page looping needed.\n\nThe spider yields structured items (text + author + tags) and exports them to JSON.\n\nBest for: multi-page crawls, full-site scraping, anything needing pagination or\nlink following across many pages.\n\nOutputs:\n  - Live stats to terminal during crawl\n  - Final crawl stats at the end\n  - quotes.json in the current directory\n\"\"\"\n\nfrom scrapling.spiders import Spider, Response\n\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 5  # Fetch up to 5 pages at once\n\n    async def parse(self, response: Response):\n        # Extract all quotes on the current page\n        for quote in response.css(\".quote\"):\n            yield {\n                \"text\": quote.css(\".text::text\").get(),\n                \"author\": quote.css(\".author::text\").get(),\n                \"tags\": quote.css(\".tags .tag::text\").getall(),\n            }\n\n        # Follow the \"Next\" button to the next page (if it exists)\n        next_page = response.css(\".next a\")\n        if next_page:\n            yield response.follow(next_page[0].attrib[\"href\"])\n\n\nif __name__ == \"__main__\":\n    result = QuotesSpider().start()\n\n    print(f\"\\n{'=' * 50}\")\n    print(f\"Scraped : {result.stats.items_scraped} quotes\")\n    print(f\"Requests: {result.stats.requests_count}\")\n    print(f\"Time    : {result.stats.elapsed_seconds:.2f}s\")\n    print(f\"Speed   : {result.stats.requests_per_second:.2f} req/s\")\n    print(f\"{'=' * 50}\\n\")\n\n    for i, item in enumerate(result.items, 1):\n        print(f\"{i:>3}. [{item['author']}] {item['text']}\")\n        if item[\"tags\"]:\n            print(f\"       Tags: {', '.join(item['tags'])}\")\n\n    # Export to JSON\n    result.items.to_json(\"quotes.json\", indent=True)\n    print(\"\\nExported to quotes.json\")\n"
  },
  {
    "path": "agent-skill/Scrapling-Skill/examples/README.md",
    "content": "# Scrapling Examples\n\nThese examples scrape [quotes.toscrape.com](https://quotes.toscrape.com) — a safe, purpose-built scraping sandbox — and demonstrate every tool available in Scrapling, from plain HTTP to full browser automation and spiders.\n\nAll examples collect **all 100 quotes across 10 pages**.\n\n## Quick Start\n\nMake sure Scrapling is installed:\n\n```bash\npip install \"scrapling[all]>=0.4.2\"\nscrapling install --force\n```\n\n## Examples\n\n| File                     | Tool              | Type                        | Best For                              |\n|--------------------------|-------------------|-----------------------------|---------------------------------------|\n| `01_fetcher_session.py`  | `FetcherSession`  | Python — persistent HTTP    | APIs, fast multi-page scraping        |\n| `02_dynamic_session.py`  | `DynamicSession`  | Python — browser automation | Dynamic/SPA pages                     |\n| `03_stealthy_session.py` | `StealthySession` | Python — stealth browser    | Cloudflare, fingerprint bypass        |\n| `04_spider.py`           | `Spider`          | Python — auto-crawling      | Multi-page crawls, full-site scraping |\n\n## Running\n\n**Python scripts:**\n\n```bash\npython examples/01_fetcher_session.py\npython examples/02_dynamic_session.py  # Opens a visible browser\npython examples/03_stealthy_session.py # Opens a visible stealth browser\npython examples/04_spider.py           # Auto-crawls all pages, exports quotes.json\n```\n\n## Escalation Guide\n\nStart with the fastest, lightest option and escalate only if needed:\n\n```\nget / FetcherSession\n  └─ If JS required → fetch / DynamicSession\n       └─ If blocked → stealthy-fetch / StealthySession\n            └─ If multi-page → Spider\n```\n"
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/fetching/choosing.md",
    "content": "# Fetchers basics\n\n## Introduction\nFetchers are classes that do requests or fetch pages in a single-line fashion with many features and return a [Response](#response-object) object. All fetchers have separate session classes to keep the session running (e.g., a browser fetcher keeps the browser open until you finish all requests).\n\nFetchers are not wrappers built on top of other libraries. They use these libraries as an engine to request/fetch pages but add features the underlying engines don't have, while still fully leveraging and optimizing them for web scraping.\n\n## Fetchers Overview\n\nScrapling provides three different fetcher classes with their session classes; each fetcher is designed for a specific use case.\n\nThe following table compares them and can be quickly used for guidance.\n\n\n| Feature            | Fetcher                                           | DynamicFetcher                                                                    | StealthyFetcher                                                                            |\n|--------------------|---------------------------------------------------|-----------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|\n| Relative speed     | 🐇🐇🐇🐇🐇                                        | 🐇🐇🐇                                                                            | 🐇🐇🐇                                                                                     |\n| Stealth            | ⭐⭐                                                | ⭐⭐⭐                                                                               | ⭐⭐⭐⭐⭐                                                                                      |\n| Anti-Bot options   | ⭐⭐                                                | ⭐⭐⭐                                                                               | ⭐⭐⭐⭐⭐                                                                                      |\n| JavaScript loading | ❌                                                 | ✅                                                                                 | ✅                                                                                          |\n| Memory Usage       | ⭐                                                 | ⭐⭐⭐                                                                               | ⭐⭐⭐                                                                                        |\n| Best used for      | Basic scraping when HTTP requests alone can do it | - Dynamically loaded websites <br/>- Small automation<br/>- Small-Mid protections | - Dynamically loaded websites <br/>- Small automation <br/>- Small-Complicated protections |\n| Browser(s)         | ❌                                                 | Chromium and Google Chrome                                                        | Chromium and Google Chrome                                                                 |\n| Browser API used   | ❌                                                 | PlayWright                                                                        | PlayWright                                                                                 |\n| Setup Complexity   | Simple                                            | Simple                                                                            | Simple                                         
                                            |\n\n## Parser configuration in all fetchers\nAll fetchers share the same import method, as you will see in the upcoming pages\n```python\n>>> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\n```\nThen you use it right away without initializing like this, and it will use the default parser settings:\n```python\n>>> page = StealthyFetcher.fetch('https://example.com') \n```\nIf you want to configure the parser ([Selector class](parsing/main_classes.md#selector)) that will be used on the response before returning it for you, then do this first:\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> Fetcher.configure(adaptive=True, keep_comments=False, keep_cdata=False)  # and the rest\n```\nor\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> Fetcher.adaptive=True\n>>> Fetcher.keep_comments=False\n>>> Fetcher.keep_cdata=False  # and the rest\n```\nThen, continue your code as usual.\n\nThe available configuration arguments are: `adaptive`, `adaptive_domain`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the [Selector](parsing/main_classes.md#selector) class. You can display the current configuration anytime by running `<fetcher_class>.display_config()`.\n\n**Info:** The `adaptive` argument is disabled by default; you must enable it to use that feature.\n\n### Set parser config per request\nAs you probably understand, the logic above for setting the parser config will apply globally to all requests/fetches made through that class, and it's intended for simplicity.\n\nIf your use case requires a different configuration for each request/fetch, you can pass a dictionary to the request method (`fetch`/`get`/`post`/...) to an argument named `selector_config`.\n\n## Response Object\nThe `Response` object is the same as the [Selector](parsing/main_classes.md#selector) class, but it has additional details about the response, like response headers, status, cookies, etc., as shown below:\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> page = Fetcher.get('https://example.com')\n\n>>> page.status          # HTTP status code\n>>> page.reason          # Status message\n>>> page.cookies         # Response cookies as a dictionary\n>>> page.headers         # Response headers\n>>> page.request_headers # Request headers\n>>> page.history         # Response history of redirections, if any\n>>> page.body            # Raw response body as bytes\n>>> page.encoding        # Response encoding\n>>> page.meta            # Response metadata dictionary (e.g., proxy used). Mainly helpful with the spiders system.\n```\nAll fetchers return the `Response` object.\n\n**Note:** Unlike the [Selector](parsing/main_classes.md#selector) class, the `Response` class's body is always bytes since v0.4."
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/fetching/dynamic.md",
    "content": "# Fetching dynamic websites\n\n`DynamicFetcher` (formerly `PlayWrightFetcher`) provides flexible browser automation with multiple configuration options and built-in stealth improvements.\n\nAs we will explain later, to automate the page, you need some knowledge of [Playwright's Page API](https://playwright.dev/python/docs/api/class-page).\n\n## Basic Usage\nYou have one primary way to import this Fetcher, which is the same for all fetchers.\n\n```python\n>>> from scrapling.fetchers import DynamicFetcher\n```\nCheck out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)\n\n**Note:** The async version of the `fetch` method is `async_fetch`.\n\nThis fetcher provides three main run options that can be combined as desired.\n\nWhich are:\n\n### 1. Vanilla Playwright\n```python\nDynamicFetcher.fetch('https://example.com')\n```\nUsing it in that manner will open a Chromium browser and load the page. There are optimizations for speed, and some stealth goes automatically under the hood, but other than that, there are no tricks or extra features unless you enable some; it's just a plain PlayWright API.\n\n### 2. Real Chrome\n```python\nDynamicFetcher.fetch('https://example.com', real_chrome=True)\n```\nIf you have a Google Chrome browser installed, use this option. It's the same as the first option, but it will use the Google Chrome browser you installed on your device instead of Chromium. This will make your requests look more authentic, so they're less detectable for better results.\n\nIf you don't have Google Chrome installed and want to use this option, you can use the command below in the terminal to install it for the library instead of installing it manually:\n```commandline\nplaywright install chrome\n```\n\n### 3. CDP Connection\n```python\nDynamicFetcher.fetch('https://example.com', cdp_url='ws://localhost:9222')\n```\nInstead of launching a browser locally (Chromium/Google Chrome), you can connect to a remote browser through the [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/).\n\n\n**Notes:**\n* There was a `stealth` option here, but it was moved to the `StealthyFetcher` class, as explained on the next page, with additional features since version 0.3.13.\n* This makes it less confusing for new users, easier to maintain, and provides other benefits, as explained on the [StealthyFetcher page](stealthy.md).\n\n## Full list of arguments\nAll arguments for `DynamicFetcher` and its session classes:\n\n|      Argument       | Description                                                                                                                                                                                                                         | Optional |\n|:-------------------:|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:|\n|         url         | Target url                                                                                                                                                                                                                          |    ❌     |\n|      headless       | Pass `True` to run the browser in headless/hidden (**default**) or `False` for headful/visible mode.                                                                                                       
                         |    ✔️    |\n|  disable_resources  | Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.                         |    ✔️    |\n|       cookies       | Set cookies for the next request.                                                                                                                                                                                                   |    ✔️    |\n|      useragent      | Pass a useragent string to be used. **Otherwise, the fetcher will generate and use a real Useragent of the same browser and version.**                                                                                              |    ✔️    |\n|    network_idle     | Wait for the page until there are no network connections for at least 500 ms.                                                                                                                                                       |    ✔️    |\n|      load_dom       | Enabled by default, wait for all JavaScript on page(s) to fully load and execute (wait for the `domcontentloaded` state).                                                                                                           |    ✔️    |\n|       timeout       | The timeout (milliseconds) used in all operations and waits through the page. The default is 30,000 ms (30 seconds).                                                                                                                |    ✔️    |\n|        wait         | The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.                                                                                                |    ✔️    |\n|     page_action     | Added for automation. Pass a function that takes the `page` object and does the necessary automation.                                                                                                                               |    ✔️    |\n|    wait_selector    | Wait for a specific css selector to be in a specific state.                                                                                                                                                                         |    ✔️    |\n|     init_script     | An absolute path to a JavaScript file to be executed on page creation for all pages in this session.                                                                                                                                |    ✔️    |\n| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._                                                                                                 |    ✔️    |\n|    google_search    | Enabled by default, Scrapling will set a Google referer header.                                                                                               |    ✔️    |\n|    extra_headers    | A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._                                                                   |    ✔️    |\n|        proxy        | The proxy to be used with requests. 
It can be a string or a dictionary with only the keys 'server', 'username', and 'password'.                                                                                                     |    ✔️    |\n|     real_chrome     | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser.                                                                                                |    ✔️    |\n|       locale        | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. |    ✔️    |\n|     timezone_id     | Changes the timezone of the browser. Defaults to the system timezone.                                                                                                                                                               |    ✔️    |\n|       cdp_url       | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.                                                                                                                          |    ✔️    |\n|    user_data_dir    | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions**                                                       |    ✔️    |\n|     extra_flags     | A list of additional browser flags to pass to the browser on launch.                                                                                                                                                                |    ✔️    |\n|   additional_args   | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings.                                                                                          |    ✔️    |\n|   selector_config   | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class.                                                                                                                            |    ✔️    |\n|   blocked_domains   | A set of domain names to block requests to. Subdomains are also matched (e.g., `\"example.com\"` blocks `\"sub.example.com\"` too).                                                                                                     |    ✔️    |\n|    proxy_rotator    | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`.                                                                                                                                            |    ✔️    |\n|       retries       | Number of retry attempts for failed requests. Defaults to 3.                                                                                                                                                                        |    ✔️    |\n|     retry_delay     | Seconds to wait between retry attempts. Defaults to 1.                                                                                                                                                                              |    ✔️    |\n\nIn session classes, all these arguments can be set globally for the session. 
Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `blocked_domains`, `proxy`, and `selector_config`.\n\n**Notes:**\n1. The `disable_resources` option made requests ~25% faster in tests for some websites and can help save proxy usage, but be careful with it, as it can cause some websites to never finish loading.\n2. The `google_search` argument is enabled by default for all requests, setting the referer to `https://www.google.com/`. If used together with `extra_headers`, it takes priority over the referer set there.\n3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`.\n4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.\n\n\n## Examples\n\n### Resource Control\n\n```python\n# Disable unnecessary resources\npage = DynamicFetcher.fetch('https://example.com', disable_resources=True)  # Blocks fonts, images, media, etc.\n```\n\n### Domain Blocking\n\n```python\n# Block requests to specific domains (and their subdomains)\npage = DynamicFetcher.fetch('https://example.com', blocked_domains={\"ads.example.com\", \"tracker.net\"})\n```\n\n### Network Control\n\n```python\n# Wait for network idle (Consider fetch to be finished when there are no network connections for at least 500 ms)\npage = DynamicFetcher.fetch('https://example.com', network_idle=True)\n\n# Custom timeout (in milliseconds)\npage = DynamicFetcher.fetch('https://example.com', timeout=30000)  # 30 seconds\n\n# Proxy support (It can also be a dictionary with only the keys 'server', 'username', and 'password'.)\npage = DynamicFetcher.fetch('https://example.com', proxy='http://username:password@host:port')\n```\n\n### Proxy Rotation\n\n```python\nfrom scrapling.fetchers import DynamicSession, ProxyRotator\n\n# Set up proxy rotation\nrotator = ProxyRotator([\n    \"http://proxy1:8080\",\n    \"http://proxy2:8080\",\n    \"http://proxy3:8080\",\n])\n\n# Use with session - rotates proxy automatically with each request\nwith DynamicSession(proxy_rotator=rotator, headless=True) as session:\n    page1 = session.fetch('https://example1.com')\n    page2 = session.fetch('https://example2.com')\n\n    # Override rotator for a specific request\n    page3 = session.fetch('https://example3.com', proxy='http://specific-proxy:8080')\n```\n\n**Warning:** By default, all browser-based fetchers and sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. 
Once the tab's job is done, both the tab and its context are closed.\n\n### Downloading Files\n\n```python\npage = DynamicFetcher.fetch('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png')\n\nwith open(file='main_cover.png', mode='wb') as f:\n    f.write(page.body)\n```\n\nThe `body` attribute of the `Response` object always returns `bytes`.\n\n### Browser Automation\nThis is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.\n\nThis function is executed immediately after waiting for `network_idle` (if enabled) and before waiting for the `wait_selector` argument, allowing it to be used for purposes beyond automation. You can alter the page as you want.\n\nIn the example below, I used the page's [mouse events](https://playwright.dev/python/docs/api/class-mouse) to scroll the page with the mouse wheel, then move the mouse.\n```python\nfrom playwright.sync_api import Page\n\ndef scroll_page(page: Page):\n    page.mouse.wheel(10, 0)\n    page.mouse.move(100, 400)\n    page.mouse.up()\n\npage = DynamicFetcher.fetch('https://example.com', page_action=scroll_page)\n```\nOf course, if you use the async fetch version, the function must also be async.\n```python\nfrom playwright.async_api import Page\n\nasync def scroll_page(page: Page):\n   await page.mouse.wheel(10, 0)\n   await page.mouse.move(100, 400)\n   await page.mouse.up()\n\npage = await DynamicFetcher.async_fetch('https://example.com', page_action=scroll_page)\n```\n\n### Wait Conditions\n\n```python\n# Wait for the selector\npage = DynamicFetcher.fetch(\n    'https://example.com',\n    wait_selector='h1',\n    wait_selector_state='visible'\n)\n```\nThis is the last wait the fetcher will do before returning the response (if enabled). You pass a CSS selector to the `wait_selector` argument, and the fetcher will wait for the state you passed in the `wait_selector_state` argument to be fulfilled. If you didn't pass a state, the default would be `attached`, which means it will wait for the element to be present in the DOM.\n\nAfter that, if `load_dom` is enabled (the default), the fetcher will check again to see if all JavaScript files are loaded and executed (in the `domcontentloaded` state) or continue waiting. If you have enabled `network_idle`, the fetcher will wait for `network_idle` to be fulfilled again, as explained above.\n\nThe states the fetcher can wait for can be any of the following ([source](https://playwright.dev/python/docs/api/class-page#page-wait-for-selector)):\n\n- `attached`: Wait for an element to be present in the DOM.\n- `detached`: Wait for an element to not be present in the DOM.\n- `visible`: Wait for an element to have a non-empty bounding box and no `visibility:hidden`. Note that an element without any content or with `display:none` has an empty bounding box and is not considered visible.\n- `hidden`: Wait for an element to be either detached from the DOM, or have an empty bounding box, or `visibility:hidden`. This is the opposite of the `'visible'` option.
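\n\nFor instance, here is a minimal sketch that waits for a loading indicator to disappear before the response is returned (the `.loading-spinner` selector is a hypothetical placeholder):\n```python\n# Wait for a (hypothetical) loading spinner to be hidden or removed from the DOM\npage = DynamicFetcher.fetch(\n    'https://example.com',\n    wait_selector='.loading-spinner',\n    wait_selector_state='hidden'\n)\n```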
\n\n### Some Stealth Features\n\n```python\npage = DynamicFetcher.fetch(\n    'https://example.com',\n    google_search=True,\n    useragent='Mozilla/5.0...',  # Custom user agent\n    locale='en-US',  # Set browser locale\n)\n```\n\n### General example\n```python\nfrom scrapling.fetchers import DynamicFetcher\n\ndef scrape_dynamic_content():\n    # Use Playwright for JavaScript content\n    page = DynamicFetcher.fetch(\n        'https://example.com/dynamic',\n        network_idle=True,\n        wait_selector='.content'\n    )\n    \n    # Extract dynamic content\n    content = page.css('.content')\n    \n    return {\n        'title': content.css('h1::text').get(),\n        'items': [\n            item.text for item in content.css('.item')\n        ]\n    }\n```\n\n## Session Management\n\nTo keep the browser open while you make multiple requests with the same configuration, use the `DynamicSession`/`AsyncDynamicSession` classes. These classes accept all the arguments that the `fetch` method takes, which enables you to specify a config for the entire session.\n\n```python\nfrom scrapling.fetchers import DynamicSession\n\n# Create a session with default configuration\nwith DynamicSession(\n    headless=True,\n    disable_resources=True,\n    real_chrome=True\n) as session:\n    # Make multiple requests with the same browser instance\n    page1 = session.fetch('https://example1.com')\n    page2 = session.fetch('https://example2.com')\n    page3 = session.fetch('https://dynamic-site.com')\n    \n    # All requests reuse the same tab on the same browser instance\n```\n\n### Async Session Usage\n\n```python\nimport asyncio\nfrom scrapling.fetchers import AsyncDynamicSession\n\nasync def scrape_multiple_sites():\n    async with AsyncDynamicSession(\n        network_idle=True,\n        timeout=30000,\n        max_pages=3\n    ) as session:\n        # Make async requests with shared browser configuration\n        pages = await asyncio.gather(\n            session.fetch('https://spa-app1.com'),\n            session.fetch('https://spa-app2.com'),\n            session.fetch('https://dynamic-content.com')\n        )\n        return pages\n```\n\nYou may have noticed the `max_pages` argument. This new argument enables the fetcher to create a **rotating pool of browser tabs**. Instead of using a single tab for all your requests, you set a limit on the maximum number of tabs that can be open at once. With each request, the library closes all tabs that have finished their task and checks whether the number of currently open tabs is lower than the allowed maximum, then:\n\n1. If you are within the allowed range, the fetcher creates a new tab for you, and everything proceeds as normal.\n2. Otherwise, it keeps checking at sub-second intervals for up to 60 seconds whether a new tab can be created, then raises `TimeoutError`. This can happen when the website you are fetching becomes unresponsive.\n\nThis logic allows multiple URLs to be fetched at the same time in the same browser, which saves a lot of resources and, most importantly, is very fast :)\n\nIn versions 0.3 and 0.3.1, the pool was reusing finished tabs to save more resources/time. 
That logic proved flawed, as it's nearly impossible to protect tabs from being contaminated by the configuration of the previous request.\n\n### Session Benefits\n\n- **Browser reuse**: Much faster subsequent requests by reusing the same browser instance.\n- **Cookie persistence**: Automatic cookie and session state handling, just as any browser does.\n- **Consistent fingerprint**: Same browser fingerprint across all requests.\n- **Memory efficiency**: Better resource usage compared to launching new browsers with each fetch.\n\n## When to Use\n\nUse DynamicFetcher when:\n\n- Need browser automation\n- Want multiple browser options\n- Want to use your real Chrome browser\n- Need custom browser config\n- Want a few stealth options\n\nIf you want more stealth and control without much config, check out the [StealthyFetcher](stealthy.md)."
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/fetching/static.md",
    "content": "# HTTP requests\n\nThe `Fetcher` class provides rapid and lightweight HTTP requests using the high-performance `curl_cffi` library with a lot of stealth capabilities.\n\n## Basic Usage\nImport the Fetcher (same import pattern for all fetchers):\n\n```python\n>>> from scrapling.fetchers import Fetcher\n```\nCheck out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)\n\n### Shared arguments\nAll methods for making requests here share some arguments, so let's discuss them first.\n\n- **url**: The targeted URL\n- **stealthy_headers**: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.\n- **follow_redirects**: As the name implies, tell the fetcher to follow redirections. **Enabled by default**\n- **timeout**: The number of seconds to wait for each request to be finished. **Defaults to 30 seconds**.\n- **retries**: The number of retries that the fetcher will do for failed requests. **Defaults to three retries**.\n- **retry_delay**: Number of seconds to wait between retry attempts. **Defaults to 1 second**.\n- **impersonate**: Impersonate specific browsers' TLS fingerprints. Accepts browser strings or a list of them like `\"chrome110\"`, `\"firefox102\"`, `\"safari15_5\"` to use specific versions or `\"chrome\"`, `\"firefox\"`, `\"safari\"`, `\"edge\"` to automatically use the latest version available. This makes your requests appear to come from real browsers at the TLS level. If you pass it a list of strings, it will choose a random one with each request. **Defaults to the latest available Chrome version.**\n- **http3**: Use HTTP/3 protocol for requests. **Defaults to False**. It might be problematic if used with `impersonate`.\n- **cookies**: Cookies to use in the request. Can be a dictionary of `name→value` or a list of dictionaries.\n- **proxy**: As the name implies, the proxy for this request is used to route all traffic (HTTP and HTTPS). The format accepted here is `http://username:password@localhost:8030`.\n- **proxy_auth**: HTTP basic auth for proxy, tuple of (username, password).\n- **proxies**: Dict of proxies to use. Format: `{\"http\": proxy_url, \"https\": proxy_url}`.\n- **proxy_rotator**: A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy` or `proxies`.\n- **headers**: Headers to include in the request. Can override any header generated by the `stealthy_headers` argument\n- **max_redirects**: Maximum number of redirects. **Defaults to 30**, use -1 for unlimited.\n- **verify**: Whether to verify HTTPS certificates. **Defaults to True**.\n- **cert**: Tuple of (cert, key) filenames for the client certificate.\n- **selector_config**: A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class.\n\n**Notes:**\n1. The currently available browsers to impersonate are (`\"edge\"`, `\"chrome\"`, `\"chrome_android\"`, `\"safari\"`, `\"safari_beta\"`, `\"safari_ios\"`, `\"safari_ios_beta\"`, `\"firefox\"`, `\"tor\"`)\n2. The available browsers to impersonate, along with their corresponding versions, are automatically displayed in the argument autocompletion and updated with each `curl_cffi` update.\n3. 
\n\nOther than this, for further customization, you can pass to any method any extra arguments that `curl_cffi` supports, as long as that method doesn't already expose them.\n\n### HTTP Methods\nEach method accepts additional arguments depending on its type, such as `params` for GET requests and `data`/`json` for POST/PUT/DELETE requests.\n\nExamples are the best way to explain this:\n\n> Note: `OPTIONS` and `HEAD` methods are not supported.\n#### GET\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> # Basic GET\n>>> page = Fetcher.get('https://example.com')\n>>> page = Fetcher.get('https://scrapling.requestcatcher.com/get', stealthy_headers=True, follow_redirects=True)\n>>> page = Fetcher.get('https://scrapling.requestcatcher.com/get', proxy='http://username:password@localhost:8030')\n>>> # With parameters\n>>> page = Fetcher.get('https://example.com/search', params={'q': 'query'})\n>>>\n>>> # With headers\n>>> page = Fetcher.get('https://example.com', headers={'User-Agent': 'Custom/1.0'})\n>>> # Basic HTTP authentication\n>>> page = Fetcher.get(\"https://example.com\", auth=(\"my_user\", \"password123\"))\n>>> # Browser impersonation\n>>> page = Fetcher.get('https://example.com', impersonate='chrome')\n>>> # HTTP/3 support\n>>> page = Fetcher.get('https://example.com', http3=True)\n```\nAnd for asynchronous requests, it's a small adjustment\n```python\n>>> from scrapling.fetchers import AsyncFetcher\n>>> # Basic GET\n>>> page = await AsyncFetcher.get('https://example.com')\n>>> page = await AsyncFetcher.get('https://scrapling.requestcatcher.com/get', stealthy_headers=True, follow_redirects=True)\n>>> page = await AsyncFetcher.get('https://scrapling.requestcatcher.com/get', proxy='http://username:password@localhost:8030')\n>>> # With parameters\n>>> page = await AsyncFetcher.get('https://example.com/search', params={'q': 'query'})\n>>>\n>>> # With headers\n>>> page = await AsyncFetcher.get('https://example.com', headers={'User-Agent': 'Custom/1.0'})\n>>> # Basic HTTP authentication\n>>> page = await AsyncFetcher.get(\"https://example.com\", auth=(\"my_user\", \"password123\"))\n>>> # Browser impersonation\n>>> page = await AsyncFetcher.get('https://example.com', impersonate='chrome110')\n>>> # HTTP/3 support\n>>> page = await AsyncFetcher.get('https://example.com', http3=True)\n```\nThe `page` object in all cases is a [Response](choosing.md#response-object) object, which is a [Selector](parsing/main_classes.md#selector), so you can use it directly:\n```python\n>>> page.css('.something.something')\n\n>>> page = Fetcher.get('https://api.github.com/events')\n>>> page.json()\n[{'id': '<redacted>',\n  'type': 'PushEvent',\n  'actor': {'id': '<redacted>',\n   'login': '<redacted>',\n   'display_login': '<redacted>',\n   'gravatar_id': '',\n   'url': 'https://api.github.com/users/<redacted>',\n   'avatar_url': 'https://avatars.githubusercontent.com/u/<redacted>'},\n  'repo': {'id': '<redacted>',\n...\n```\n#### POST\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> # Basic POST\n>>> page = Fetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, params={'q': 'query'})\n>>> page = Fetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, stealthy_headers=True, follow_redirects=True)\n>>> page = Fetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 
'value'}, proxy='http://username:password@localhost:8030', impersonate=\"chrome\")\n>>> # Another example of form-encoded data\n>>> page = Fetcher.post('https://example.com/submit', data={'username': 'user', 'password': 'pass'}, http3=True)\n>>> # JSON data\n>>> page = Fetcher.post('https://example.com/api', json={'key': 'value'})\n```\nAnd for asynchronous requests, it's a small adjustment\n```python\n>>> from scrapling.fetchers import AsyncFetcher\n>>> # Basic POST\n>>> page = await AsyncFetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'})\n>>> page = await AsyncFetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, stealthy_headers=True, follow_redirects=True)\n>>> page = await AsyncFetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030', impersonate=\"chrome\")\n>>> # Another example of form-encoded data\n>>> page = await AsyncFetcher.post('https://example.com/submit', data={'username': 'user', 'password': 'pass'}, http3=True)\n>>> # JSON data\n>>> page = await AsyncFetcher.post('https://example.com/api', json={'key': 'value'})\n```\n#### PUT\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> # Basic PUT\n>>> page = Fetcher.put('https://example.com/update', data={'status': 'updated'})\n>>> page = Fetcher.put('https://example.com/update', data={'status': 'updated'}, stealthy_headers=True, follow_redirects=True, impersonate=\"chrome\")\n>>> page = Fetcher.put('https://example.com/update', data={'status': 'updated'}, proxy='http://username:password@localhost:8030')\n>>> # Another example of form-encoded data\n>>> page = Fetcher.put(\"https://scrapling.requestcatcher.com/put\", data={'key': ['value1', 'value2']})\n```\nAnd for asynchronous requests, it's a small adjustment\n```python\n>>> from scrapling.fetchers import AsyncFetcher\n>>> # Basic PUT\n>>> page = await AsyncFetcher.put('https://example.com/update', data={'status': 'updated'})\n>>> page = await AsyncFetcher.put('https://example.com/update', data={'status': 'updated'}, stealthy_headers=True, follow_redirects=True, impersonate=\"chrome\")\n>>> page = await AsyncFetcher.put('https://example.com/update', data={'status': 'updated'}, proxy='http://username:password@localhost:8030')\n>>> # Another example of form-encoded data\n>>> page = await AsyncFetcher.put(\"https://scrapling.requestcatcher.com/put\", data={'key': ['value1', 'value2']})\n```\n\n#### DELETE\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> page = Fetcher.delete('https://example.com/resource/123')\n>>> page = Fetcher.delete('https://example.com/resource/123', stealthy_headers=True, follow_redirects=True, impersonate=\"chrome\")\n>>> page = Fetcher.delete('https://example.com/resource/123', proxy='http://username:password@localhost:8030')\n```\nAnd for asynchronous requests, it's a small adjustment\n```python\n>>> from scrapling.fetchers import AsyncFetcher\n>>> page = await AsyncFetcher.delete('https://example.com/resource/123')\n>>> page = await AsyncFetcher.delete('https://example.com/resource/123', stealthy_headers=True, follow_redirects=True, impersonate=\"chrome\")\n>>> page = await AsyncFetcher.delete('https://example.com/resource/123', proxy='http://username:password@localhost:8030')\n```\n\n## Session Management\n\nFor making multiple requests with the same configuration, use the `FetcherSession` class. 
It can be used in both synchronous and asynchronous code without issue; the class automatically detects and changes the session type, without requiring a different import.\n\nThe `FetcherSession` class can accept nearly all the arguments that the methods can take, which enables you to specify a config for the entire session and later choose a different config for one of the requests effortlessly, as you will see in the following examples.\n\n```python\nfrom scrapling.fetchers import FetcherSession\n\n# Create a session with default configuration\nwith FetcherSession(\n    impersonate='chrome',\n    http3=True,\n    stealthy_headers=True,\n    timeout=30,\n    retries=3\n) as session:\n    # Make multiple requests with the same settings and the same cookies\n    page1 = session.get('https://scrapling.requestcatcher.com/get')\n    page2 = session.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'})\n    page3 = session.get('https://api.github.com/events')\n\n    # All requests share the same session and connection pool\n```\n\nYou can also use a `ProxyRotator` with `FetcherSession` for automatic proxy rotation across requests:\n\n```python\nfrom scrapling.fetchers import FetcherSession, ProxyRotator\n\nrotator = ProxyRotator([\n    'http://proxy1:8080',\n    'http://proxy2:8080',\n    'http://proxy3:8080',\n])\n\nwith FetcherSession(proxy_rotator=rotator, impersonate='chrome') as session:\n    # Each request automatically uses the next proxy in rotation\n    page1 = session.get('https://example.com/page1')\n    page2 = session.get('https://example.com/page2')\n\n    # You can check which proxy was used via the response metadata\n    print(page1.meta['proxy'])\n```\n\nYou can also override the session proxy (or rotator) for a specific request by passing `proxy=` directly to the request method:\n\n```python\nwith FetcherSession(proxy='http://default-proxy:8080') as session:\n    # Uses the session proxy\n    page1 = session.get('https://example.com/page1')\n\n    # Override the proxy for this specific request\n    page2 = session.get('https://example.com/page2', proxy='http://special-proxy:9090')\n```\n\nAnd here's an async example\n\n```python\nasync with FetcherSession(impersonate='firefox', http3=True) as session:\n    # All standard HTTP methods available\n    response = await session.get('https://example.com')\n    response = await session.post('https://scrapling.requestcatcher.com/post', json={'data': 'value'})\n    response = await session.put('https://scrapling.requestcatcher.com/put', data={'update': 'info'})\n    response = await session.delete('https://scrapling.requestcatcher.com/delete')\n```\nor better\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession\n\n# Async session usage\nasync with FetcherSession(impersonate=\"safari\") as session:\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n\n    tasks = [\n        session.get(url) for url in urls\n    ]\n\n    pages = await asyncio.gather(*tasks)\n```\n\nThe `Fetcher` class uses `FetcherSession` to create a temporary session with each request you make.\n\n### Session Benefits\n\n- **A lot faster**: 10 times faster than creating a single session for each request\n- **Cookie persistence**: Automatic cookie handling across requests\n- **Resource efficiency**: Better memory and CPU usage for multiple requests\n- **Centralized configuration**: Single place to manage request settings\n\n## Examples\nSome well-rounded examples to aid newcomers to Web Scraping\n\n### Basic 
HTTP Request\n\n```python\nfrom scrapling.fetchers import Fetcher\n\n# Make a request\npage = Fetcher.get('https://example.com')\n\n# Check the status\nif page.status == 200:\n    # Extract title\n    title = page.css('title::text').get()\n    print(f\"Page title: {title}\")\n\n    # Extract all links\n    links = page.css('a::attr(href)').getall()\n    print(f\"Found {len(links)} links\")\n```\n\n### Product Scraping\n\n```python\nfrom scrapling.fetchers import Fetcher\n\ndef scrape_products():\n    page = Fetcher.get('https://example.com/products')\n    \n    # Find all product elements\n    products = page.css('.product')\n    \n    results = []\n    for product in products:\n        results.append({\n            'title': product.css('.title::text').get(),\n            'price': product.css('.price::text').re_first(r'\\d+\\.\\d{2}'),\n            'description': product.css('.description::text').get(),\n            'in_stock': product.has_class('in-stock')\n        })\n    \n    return results\n```\n\n### Downloading Files\n\n```python\nfrom scrapling.fetchers import Fetcher\n\npage = Fetcher.get('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png')\nwith open(file='main_cover.png', mode='wb') as f:\n   f.write(page.body)\n```\n\n### Pagination Handling\n\n```python\nfrom scrapling.fetchers import Fetcher\n\ndef scrape_all_pages():\n    base_url = 'https://example.com/products?page={}'\n    page_num = 1\n    all_products = []\n    \n    while True:\n        # Get current page\n        page = Fetcher.get(base_url.format(page_num))\n        \n        # Find products\n        products = page.css('.product')\n        if not products:\n            break\n            \n        # Process products\n        for product in products:\n            all_products.append({\n                'name': product.css('.name::text').get(),\n                'price': product.css('.price::text').get()\n            })\n            \n        # Next page\n        page_num += 1\n        \n    return all_products\n```\n\n### Form Submission\n\n```python\nfrom scrapling.fetchers import Fetcher\n\n# Submit login form\nresponse = Fetcher.post(\n    'https://example.com/login',\n    data={\n        'username': 'user@example.com',\n        'password': 'password123'\n    }\n)\n\n# Check login success\nif response.status == 200:\n    # Extract user info\n    user_name = response.css('.user-name::text').get()\n    print(f\"Logged in as: {user_name}\")\n```\n\n### Table Extraction\n\n```python\nfrom scrapling.fetchers import Fetcher\n\ndef extract_table():\n    page = Fetcher.get('https://example.com/data')\n    \n    # Find table\n    table = page.css('table')[0]\n    \n    # Extract headers\n    headers = [\n        th.text for th in table.css('thead th')\n    ]\n    \n    # Extract rows\n    rows = []\n    for row in table.css('tbody tr'):\n        cells = [td.text for td in row.css('td')]\n        rows.append(dict(zip(headers, cells)))\n        \n    return rows\n```\n\n### Navigation Menu\n\n```python\nfrom scrapling.fetchers import Fetcher\n\ndef extract_menu():\n    page = Fetcher.get('https://example.com')\n    \n    # Find navigation\n    nav = page.css('nav')[0]\n    \n    menu = {}\n    for item in nav.css('li'):\n        links = item.css('a')\n        if links:\n            link = links[0]\n            menu[link.text] = {\n                'url': link['href'],\n                'has_submenu': bool(item.css('.submenu'))\n            }\n            \n    return menu\n```\n\n## When to 
Use\n\nUse `Fetcher` when:\n\n- Need rapid HTTP requests.\n- Want minimal overhead.\n- Don't need JavaScript execution (the website can be scraped through requests).\n- Need some stealth features (e.g., the targeted website uses protection but doesn't use JavaScript challenges).\n\nUse `FetcherSession` when:\n\n- Making multiple requests to the same or different sites.\n- Need to maintain cookies/authentication between requests.\n- Want connection pooling for better performance.\n- Require consistent configuration across requests.\n- Working with APIs that require a session state.\n\nUse other fetchers when:\n\n- Need browser automation.\n- Need advanced anti-bot/stealth capabilities.\n- Need JavaScript support or need to interact with dynamic content."
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/fetching/stealthy.md",
    "content": "# StealthyFetcher\n\n`StealthyFetcher` is a stealthy browser-based fetcher similar to [DynamicFetcher](dynamic.md), using [Playwright's API](https://playwright.dev/python/docs/intro). It adds advanced anti-bot protection bypass capabilities, most handled automatically. It shares the same browser automation model as `DynamicFetcher`, using [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) for page interaction.\n\n## Basic Usage\nYou have one primary way to import this Fetcher, which is the same for all fetchers.\n\n```python\n>>> from scrapling.fetchers import StealthyFetcher\n```\nCheck out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)\n\n**Note:** The async version of the `fetch` method is `async_fetch`.\n\n## What does it do?\n\nThe `StealthyFetcher` class is a stealthy version of the [DynamicFetcher](dynamic.md) class, and here are some of the things it does:\n\n1. It easily bypasses all types of Cloudflare's Turnstile/Interstitial automatically. \n2. It bypasses CDP runtime leaks and WebRTC leaks.\n3. It isolates JS execution, removes many Playwright fingerprints, and stops detection through some of the known behaviors that bots do.\n4. It generates canvas noise to prevent fingerprinting through canvas.\n5. It automatically patches known methods to detect running in headless mode and provides an option to defeat timezone mismatch attacks.\n6. and other anti-protection options...\n\n## Full list of arguments\nScrapling provides many options with this fetcher and its session classes. Before jumping to the [examples](#examples), here's the full list of arguments\n\n\n|      Argument       | Description                                                                                                                                                                                                                         | Optional |\n|:-------------------:|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:|\n|         url         | Target url                                                                                                                                                                                                                          |    ❌     |\n|      headless       | Pass `True` to run the browser in headless/hidden (**default**) or `False` for headful/visible mode.                                                                                                                                |    ✔️    |\n|  disable_resources  | Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.                         |    ✔️    |\n|       cookies       | Set cookies for the next request.                                                                                                                                                                                                   |    ✔️    |\n|      useragent      | Pass a useragent string to be used. 
**Otherwise, the fetcher will generate and use a real Useragent of the same browser and version.**                                                                                              |    ✔️    |\n|    network_idle     | Wait for the page until there are no network connections for at least 500 ms.                                                                                                                                                       |    ✔️    |\n|      load_dom       | Enabled by default, wait for all JavaScript on page(s) to fully load and execute (wait for the `domcontentloaded` state).                                                                                                           |    ✔️    |\n|       timeout       | The timeout (milliseconds) used in all operations and waits through the page. The default is 30,000 ms (30 seconds).                                                                                                                |    ✔️    |\n|        wait         | The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.                                                                                                |    ✔️    |\n|     page_action     | Added for automation. Pass a function that takes the `page` object and does the necessary automation.                                                                                                                               |    ✔️    |\n|    wait_selector    | Wait for a specific css selector to be in a specific state.                                                                                                                                                                         |    ✔️    |\n|     init_script     | An absolute path to a JavaScript file to be executed on page creation for all pages in this session.                                                                                                                                |    ✔️    |\n| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._                                                                                                 |    ✔️    |\n|    google_search    | Enabled by default, Scrapling will set a Google referer header.                                                                                               |    ✔️    |\n|    extra_headers    | A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._                                                                   |    ✔️    |\n|        proxy        | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'.                                                                                                     |    ✔️    |\n|     real_chrome     | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser.                                                                                                |    ✔️    |\n|       locale        | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. 
Defaults to the system default locale. |    ✔️    |\n|     timezone_id     | Changes the timezone of the browser. Defaults to the system timezone.                                                                                                                                                               |    ✔️    |\n|       cdp_url       | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.                                                                                                                          |    ✔️    |\n|    user_data_dir    | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions**                                                       |    ✔️    |\n|     extra_flags     | A list of additional browser flags to pass to the browser on launch.                                                                                                                                                                |    ✔️    |\n|  solve_cloudflare   | When enabled, fetcher solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.                                                                                                      |    ✔️    |\n|    block_webrtc     | Forces WebRTC to respect proxy settings to prevent local IP address leak.                                                                                                                                                           |    ✔️    |\n|     hide_canvas     | Add random noise to canvas operations to prevent fingerprinting.                                                                                                                                                                    |    ✔️    |\n|     allow_webgl     | Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended, as many WAFs now check if WebGL is enabled.                                                                     |    ✔️    |\n|   additional_args   | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings.                                                                                          |    ✔️    |\n|   selector_config   | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class.                                                                                                                            |    ✔️    |\n|   blocked_domains   | A set of domain names to block requests to. Subdomains are also matched (e.g., `\"example.com\"` blocks `\"sub.example.com\"` too).                                                                                                     |    ✔️    |\n|    proxy_rotator    | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`.                                                                                                                                            |    ✔️    |\n|       retries       | Number of retry attempts for failed requests. Defaults to 3.                                                                                                                                                                        
|    ✔️    |\n|     retry_delay     | Seconds to wait between retry attempts. Defaults to 1.                                                                                                                                                                              |    ✔️    |\n\nIn session classes, all these arguments can be set globally for the session. You can still configure each request individually by passing any of the arguments that can be set at the browser-tab level, like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `solve_cloudflare`, `blocked_domains`, `proxy`, and `selector_config`.\n\n**Notes:**\n\n1. These are basically the same arguments as the [DynamicFetcher](dynamic.md) class, with these additional arguments: `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`.\n2. The `disable_resources` option made requests ~25% faster in tests for some websites and can help save proxy usage, but be careful with it, as it can cause some websites to never finish loading.\n3. The `google_search` argument is enabled by default for all requests, setting the referer to `https://www.google.com/`. If used together with `extra_headers`, it takes priority over the referer set there.\n4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.\n\n## Examples\n\n### Cloudflare and stealth options\n\n```python\n# Automatic Cloudflare solver\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare', solve_cloudflare=True)\n\n# Works with other stealth options\npage = StealthyFetcher.fetch(\n    'https://protected-site.com',\n    solve_cloudflare=True,\n    block_webrtc=True,\n    real_chrome=True,\n    hide_canvas=True,\n    google_search=True,\n    proxy='http://username:password@host:port',  # It can also be a dictionary with only the keys 'server', 'username', and 'password'.\n)\n```\n\nThe `solve_cloudflare` parameter enables automatic detection and solving of all types of Cloudflare's Turnstile/Interstitial challenges:\n\n- JavaScript challenges (managed)\n- Interactive challenges (clicking verification boxes)\n- Invisible challenges (automatic background verification)\n\nIt even solves custom pages with an embedded captcha.\n\n**Important notes:**\n\n1. Sometimes, with websites that use custom implementations, you will need to use `wait_selector` to make sure Scrapling waits for the real website content to load after solving the captcha (see the sketch below). Some websites are the very definition of an edge case, even though we try to make the solver as generic as possible.\n2. The timeout should be at least 60 seconds when using the Cloudflare solver, to allow sufficient challenge-solving time.\n3. This feature works seamlessly with proxies and other stealth options.
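\n\nPutting notes 1 and 2 together, here is a minimal sketch (the URL and selector are placeholders) that solves the challenge, waits for the real content, and allows enough time for the solver:\n```python\n# Placeholder URL and selector, for illustration only\npage = StealthyFetcher.fetch(\n    'https://protected-site.com',\n    solve_cloudflare=True,\n    wait_selector='.content',  # wait for the real page content after the challenge is solved\n    timeout=90000,  # 90 seconds (milliseconds), comfortably above the 60-second minimum\n)\n```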
\n\n### Browser Automation\nThis is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.\n\nThis function is executed immediately after waiting for `network_idle` (if enabled) and before waiting for the `wait_selector` argument, allowing it to be used for purposes beyond automation. You can alter the page as you want.\n\nIn the example below, I used the page's [mouse events](https://playwright.dev/python/docs/api/class-mouse) to scroll the page with the mouse wheel, then move the mouse.\n```python\nfrom playwright.sync_api import Page\n\ndef scroll_page(page: Page):\n    page.mouse.wheel(10, 0)\n    page.mouse.move(100, 400)\n    page.mouse.up()\n\npage = StealthyFetcher.fetch('https://example.com', page_action=scroll_page)\n```\nOf course, if you use the async fetch version, the function must also be async.\n```python\nfrom playwright.async_api import Page\n\nasync def scroll_page(page: Page):\n   await page.mouse.wheel(10, 0)\n   await page.mouse.move(100, 400)\n   await page.mouse.up()\n\npage = await StealthyFetcher.async_fetch('https://example.com', page_action=scroll_page)\n```\n\n### Wait Conditions\n```python\n# Wait for the selector\npage = StealthyFetcher.fetch(\n    'https://example.com',\n    wait_selector='h1',\n    wait_selector_state='visible'\n)\n```\nThis is the last wait the fetcher will do before returning the response (if enabled). You pass a CSS selector to the `wait_selector` argument, and the fetcher will wait for the state you passed in the `wait_selector_state` argument to be fulfilled. If you didn't pass a state, the default would be `attached`, which means it will wait for the element to be present in the DOM.\n\nAfter that, if `load_dom` is enabled (the default), the fetcher will check again to see if all JavaScript files are loaded and executed (in the `domcontentloaded` state) or continue waiting. If you have enabled `network_idle`, the fetcher will wait for `network_idle` to be fulfilled again, as explained above.\n\nThe states the fetcher can wait for can be any of the following ([source](https://playwright.dev/python/docs/api/class-page#page-wait-for-selector)):\n\n- `attached`: Wait for an element to be present in the DOM.\n- `detached`: Wait for an element to not be present in the DOM.\n- `visible`: Wait for an element to have a non-empty bounding box and no `visibility:hidden`. Note that an element without any content or with `display:none` has an empty bounding box and is not considered visible.\n- `hidden`: Wait for an element to be either detached from the DOM, or have an empty bounding box, or `visibility:hidden`. 
This is opposite to the `'visible'` option.\n\n\n### Real-world example (Amazon)\nThis is for educational purposes only; this example was generated by AI, which also shows how easy it is to work with Scrapling through AI\n```python\ndef scrape_amazon_product(url):\n    # Use StealthyFetcher to bypass protection\n    page = StealthyFetcher.fetch(url)\n\n    # Extract product details\n    return {\n        'title': page.css('#productTitle::text').get().clean(),\n        'price': page.css('.a-price .a-offscreen::text').get(),\n        'rating': page.css('[data-feature-name=\"averageCustomerReviews\"] .a-popover-trigger .a-color-base::text').get(),\n        'reviews_count': page.css('#acrCustomerReviewText::text').re_first(r'[\\d,]+'),\n        'features': [\n            li.get().clean() for li in page.css('#feature-bullets li span::text')\n        ],\n        'availability': page.css('#availability')[0].get_all_text(strip=True),\n        'images': [\n            img.attrib['src'] for img in page.css('#altImages img')\n        ]\n    }\n```\n\n## Session Management\n\nTo keep the browser open until you make multiple requests with the same configuration, use `StealthySession`/`AsyncStealthySession` classes. Those classes can accept all the arguments that the `fetch` function can take, which enables you to specify a config for the entire session.\n\n```python\nfrom scrapling.fetchers import StealthySession\n\n# Create a session with default configuration\nwith StealthySession(\n    headless=True,\n    real_chrome=True,\n    block_webrtc=True,\n    solve_cloudflare=True\n) as session:\n    # Make multiple requests with the same browser instance\n    page1 = session.fetch('https://example1.com')\n    page2 = session.fetch('https://example2.com') \n    page3 = session.fetch('https://nopecha.com/demo/cloudflare')\n    \n    # All requests reuse the same tab on the same browser instance\n```\n\n### Async Session Usage\n\n```python\nimport asyncio\nfrom scrapling.fetchers import AsyncStealthySession\n\nasync def scrape_multiple_sites():\n    async with AsyncStealthySession(\n        real_chrome=True,\n        block_webrtc=True,\n        solve_cloudflare=True,\n        timeout=60000,  # 60 seconds for Cloudflare challenges\n        max_pages=3\n    ) as session:\n        # Make async requests with shared browser configuration\n        pages = await asyncio.gather(\n            session.fetch('https://site1.com'),\n            session.fetch('https://site2.com'), \n            session.fetch('https://protected-site.com')\n        )\n        return pages\n```\n\nYou may have noticed the `max_pages` argument. This is a new argument that enables the fetcher to create a **rotating pool of Browser tabs**. Instead of using a single tab for all your requests, you set a limit on the maximum number of pages that can be displayed at once. With each request, the library will close all tabs that have finished their task and check if the number of the current tabs is lower than the maximum allowed number of pages/tabs, then:\n\n1. If you are within the allowed range, the fetcher will create a new tab for you, and then all is as normal.\n2. Otherwise, it will keep checking every subsecond if creating a new tab is allowed or not for 60 seconds, then raise `TimeoutError`. 
This can happen when the website you are fetching becomes unresponsive.\n\nThis logic allows multiple URLs to be fetched at the same time in the same browser, which saves a lot of resources and, most importantly, is very fast :)\n\nIn versions 0.3 and 0.3.1, the pool was reusing finished tabs to save more resources/time. That logic proved flawed, as it's nearly impossible to protect tabs from being contaminated by the configuration of the previous request.\n\n### Session Benefits\n\n- **Browser reuse**: Much faster subsequent requests by reusing the same browser instance.\n- **Cookie persistence**: Automatic cookie and session state handling, just as any browser does.\n- **Consistent fingerprint**: Same browser fingerprint across all requests.\n- **Memory efficiency**: Better resource usage compared to launching new browsers with each fetch.\n\n## When to Use\n\nUse StealthyFetcher when:\n\n- Need to bypass anti-bot protection\n- Need a reliable browser fingerprint\n- Need full JavaScript support\n- Want automatic stealth features\n- Need browser automation\n- Are dealing with Cloudflare protection"
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/mcp-server.md",
    "content": "# Scrapling MCP Server\n\nThe Scrapling MCP server exposes six web scraping tools over the MCP protocol. It supports CSS-selector-based content narrowing (reducing tokens by extracting only relevant elements before returning results) and three levels of scraping capability: plain HTTP, browser-rendered, and stealth (anti-bot bypass).\n\nAll tools return a `ResponseModel` with fields: `status` (int), `content` (list of strings), `url` (str).\n\n## Tools\n\n### `get` -- HTTP request (single URL)\n\nFast HTTP GET with browser fingerprint impersonation (TLS, headers). Suitable for static pages with no/low bot protection.\n\n**Key parameters:**\n\n| Parameter           | Type                               | Default      | Description                                                        |\n|---------------------|------------------------------------|--------------|--------------------------------------------------------------------|\n| `url`               | str                                | required     | URL to fetch                                                       |\n| `extraction_type`   | `\"markdown\"` / `\"html\"` / `\"text\"` | `\"markdown\"` | Output format                                                      |\n| `css_selector`      | str or null                        | null         | CSS selector to narrow content (applied after `main_content_only`) |\n| `main_content_only` | bool                               | true         | Restrict to `<body>` content                                       |\n| `impersonate`       | str                                | `\"chrome\"`   | Browser fingerprint to impersonate                                 |\n| `proxy`             | str or null                        | null         | Proxy URL, e.g. 
`\"http://user:pass@host:port\"`                     |\n| `proxy_auth`        | dict or null                       | null         | `{\"username\": \"...\", \"password\": \"...\"}`                           |\n| `auth`              | dict or null                       | null         | HTTP basic auth, same format as proxy_auth                         |\n| `timeout`           | number                             | 30           | Seconds before timeout                                             |\n| `retries`           | int                                | 3            | Retry attempts on failure                                          |\n| `retry_delay`       | int                                | 1            | Seconds between retries                                            |\n| `stealthy_headers`  | bool                               | true         | Generate realistic browser headers and Google referer       |\n| `http3`             | bool                               | false        | Use HTTP/3 (may conflict with `impersonate`)                       |\n| `follow_redirects`  | bool                               | true         | Follow HTTP redirects                                              |\n| `max_redirects`     | int                                | 30           | Max redirects (-1 for unlimited)                                   |\n| `headers`           | dict or null                       | null         | Custom request headers                                             |\n| `cookies`           | dict or null                       | null         | Request cookies                                                    |\n| `params`            | dict or null                       | null         | Query string parameters                                            |\n| `verify`            | bool                               | true         | Verify HTTPS certificates                                          |\n\n### `bulk_get` -- HTTP request (multiple URLs)\n\nAsync concurrent version of `get`. Same parameters except `url` is replaced by `urls` (list of strings). All URLs are fetched in parallel. Returns a list of `ResponseModel`.\n\n### `fetch` -- Browser fetch (single URL)\n\nOpens a Chromium browser via Playwright to render JavaScript. 
Suitable for dynamic/SPA pages with no/low bot protection.\n\n**Key parameters (beyond shared ones):**\n\n| Parameter             | Type                | Default      | Description                                                                     |\n|-----------------------|---------------------|--------------|---------------------------------------------------------------------------------|\n| `url`                 | str                 | required     | URL to fetch                                                                    |\n| `extraction_type`     | str                 | `\"markdown\"` | `\"markdown\"` / `\"html\"` / `\"text\"`                                              |\n| `css_selector`        | str or null         | null         | Narrow content before extraction                                                |\n| `main_content_only`   | bool                | true         | Restrict to `<body>`                                                            |\n| `headless`            | bool                | true         | Run browser hidden (true) or visible (false)                                    |\n| `proxy`               | str or dict or null | null         | String URL or `{\"server\": \"...\", \"username\": \"...\", \"password\": \"...\"}`         |\n| `timeout`             | number              | 30000        | Timeout in **milliseconds**                                                     |\n| `wait`                | number              | 0            | Extra wait (ms) after page load before extraction                               |\n| `wait_selector`       | str or null         | null         | CSS selector to wait for before extraction                                      |\n| `wait_selector_state` | str                 | `\"attached\"` | State for wait_selector: `\"attached\"` / `\"visible\"` / `\"hidden\"` / `\"detached\"` |\n| `network_idle`        | bool                | false        | Wait until no network activity for 500ms                                        |\n| `disable_resources`   | bool                | false        | Block fonts, images, media, stylesheets, etc. for speed                         |\n| `google_search`       | bool                | true         | Set a Google referer header                                            |\n| `real_chrome`         | bool                | false        | Use locally installed Chrome instead of bundled Chromium                        |\n| `cdp_url`             | str or null         | null         | Connect to existing browser via CDP URL                                         |\n| `extra_headers`       | dict or null        | null         | Additional request headers                                                      |\n| `useragent`           | str or null         | null         | Custom user-agent (auto-generated if null)                                      |\n| `cookies`             | list or null        | null         | Playwright-format cookies                                                       |\n| `timezone_id`         | str or null         | null         | Browser timezone, e.g. `\"America/New_York\"`                                     |\n| `locale`              | str or null         | null         | Browser locale, e.g. `\"en-GB\"`                                                  |\n\n### `bulk_fetch` -- Browser fetch (multiple URLs)\n\nConcurrent browser version of `fetch`. Same parameters except `url` is replaced by `urls` (list of strings). Each URL opens in a separate browser tab. 
Returns a list of `ResponseModel`.\n\n### `stealthy_fetch` -- Stealth browser fetch (single URL)\n\nAnti-bot bypass fetcher with fingerprint spoofing. Use this for sites with Cloudflare Turnstile/Interstitial or other strong protections.\n\n**Additional parameters (beyond those in `fetch`):**\n\n| Parameter          | Type         | Default | Description                                                      |\n|--------------------|--------------|---------|------------------------------------------------------------------|\n| `solve_cloudflare` | bool         | false   | Automatically solve Cloudflare Turnstile/Interstitial challenges |\n| `hide_canvas`      | bool         | false   | Add noise to canvas operations to prevent fingerprinting         |\n| `block_webrtc`     | bool         | false   | Force WebRTC to respect proxy settings (prevents IP leak)        |\n| `allow_webgl`      | bool         | true    | Keep WebGL enabled (disabling is detectable by WAFs)             |\n| `additional_args`  | dict or null | null    | Extra Playwright context args (overrides Scrapling defaults)     |\n\nAll parameters from `fetch` are also accepted.\n\n### `bulk_stealthy_fetch` -- Stealth browser fetch (multiple URLs)\n\nConcurrent stealth version. Same parameters as `stealthy_fetch` except `url` is replaced by `urls` (list of strings). Returns a list of `ResponseModel`.\n\n## Tool selection guide\n\n| Scenario                                 | Tool                                                          |\n|------------------------------------------|---------------------------------------------------------------|\n| Static page, no bot protection           | `get`                                                         |\n| Multiple static pages                    | `bulk_get`                                                    |\n| JavaScript-rendered / SPA page           | `fetch`                                                       |\n| Multiple JS-rendered pages               | `bulk_fetch`                                                  |\n| Cloudflare or strong anti-bot protection | `stealthy_fetch` (with `solve_cloudflare=true` for Turnstile) |\n| Multiple protected pages                 | `bulk_stealthy_fetch`                                         |\n\nStart with `get` (fastest, lowest resource cost). Escalate to `fetch` if content requires JS rendering. Escalate to `stealthy_fetch` only if blocked.\n\n## Content extraction tips\n\n- Use `css_selector` to narrow results before they reach the model -- this saves significant tokens.\n- `main_content_only=true` (default) strips nav/footer by restricting to `<body>`.\n- `extraction_type=\"markdown\"` (default) is best for readability. Use `\"text\"` for minimal output, `\"html\"` when structure matters.\n- If a `css_selector` matches multiple elements, all are returned in the `content` list.\n\n## Setup\n\nStart the server (stdio transport, used by most MCP clients):\n\n```bash\nscrapling mcp\n```\n\nOr with Streamable HTTP transport:\n\n```bash\nscrapling mcp --http\nscrapling mcp --http --host 127.0.0.1 --port 8000\n```\n\nDocker alternative:\n\n```bash\ndocker pull pyd4vinci/scrapling\ndocker run -i --rm scrapling mcp\n```\n\nThe MCP server name when registering with a client is `ScraplingServer`. The command is the path to the `scrapling` binary and the argument is `mcp`."
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/migrating_from_beautifulsoup.md",
    "content": "# Migrating from BeautifulSoup to Scrapling\n\nAPI comparison between BeautifulSoup and Scrapling. Scrapling is faster, provides equivalent parsing capabilities, and adds features for fetching and handling modern web pages.\n\nSome BeautifulSoup shortcuts have no direct Scrapling equivalent. Scrapling avoids those shortcuts to preserve performance.\n\n\n| Task                                                            | BeautifulSoup Code                                                                                            | Scrapling Code                                                                    |\n|-----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------|\n| Parser import                                                   | `from bs4 import BeautifulSoup`                                                                               | `from scrapling.parser import Selector`                                           |\n| Parsing HTML from string                                        | `soup = BeautifulSoup(html, 'html.parser')`                                                                   | `page = Selector(html)`                                                           |\n| Finding a single element                                        | `element = soup.find('div', class_='example')`                                                                | `element = page.find('div', class_='example')`                                    |\n| Finding multiple elements                                       | `elements = soup.find_all('div', class_='example')`                                                           | `elements = page.find_all('div', class_='example')`                               |\n| Finding a single element (Example 2)                            | `element = soup.find('div', attrs={\"class\": \"example\"})`                                                      | `element = page.find('div', {\"class\": \"example\"})`                                |\n| Finding a single element (Example 3)                            | `element = soup.find(re.compile(\"^b\"))`                                                                       | `element = page.find(re.compile(\"^b\"))`<br/>`element = page.find_by_regex(r\"^b\")` |\n| Finding a single element (Example 4)                            | `element = soup.find(lambda e: len(list(e.children)) > 0)`                                                    | `element = page.find(lambda e: len(e.children) > 0)`                              |\n| Finding a single element (Example 5)                            | `element = soup.find([\"a\", \"b\"])`                                                                             | `element = page.find([\"a\", \"b\"])`                                                 |\n| Find element by its text content                                | `element = soup.find(text=\"some text\")`                                                                       | `element = page.find_by_text(\"some text\", partial=False)`                         |\n| Using CSS selectors to find the first matching element          | `elements = soup.select_one('div.example')`                                                                   | `elements = page.css('div.example').first`                    
                    |\n| Using CSS selectors to find all matching element                | `elements = soup.select('div.example')`                                                                       | `elements = page.css('div.example')`                                              |\n| Get a prettified version of the page/element source             | `prettified = soup.prettify()`                                                                                | `prettified = page.prettify()`                                                    |\n| Get a Non-pretty version of the page/element source             | `source = str(soup)`                                                                                          | `source = page.html_content`                                                      |\n| Get tag name of an element                                      | `name = element.name`                                                                                         | `name = element.tag`                                                              |\n| Extracting text content of an element                           | `string = element.string`                                                                                     | `string = element.text`                                                           |\n| Extracting all the text in a document or beneath a tag          | `text = soup.get_text(strip=True)`                                                                            | `text = page.get_all_text(strip=True)`                                            |\n| Access the dictionary of attributes                             | `attrs = element.attrs`                                                                                       | `attrs = element.attrib`                                                          |\n| Extracting attributes                                           | `attr = element['href']`                                                                                      | `attr = element['href']`                                                          |\n| Navigating to parent                                            | `parent = element.parent`                                                                                     | `parent = element.parent`                                                         |\n| Get all parents of an element                                   | `parents = list(element.parents)`                                                                             | `parents = list(element.iterancestors())`                                         |\n| Searching for an element in the parents of an element           | `target_parent = element.find_parent(\"a\")`                                                                    | `target_parent = element.find_ancestor(lambda p: p.tag == 'a')`                   |\n| Get all siblings of an element                                  | N/A                                                                                                           | `siblings = element.siblings`                                                     |\n| Get next sibling of an element                                  | `next_element = element.next_sibling`                                                                         | `next_element = element.next`                                                     |\n| Searching for an element in the siblings of an element          | `target_sibling 
= element.find_next_sibling(\"a\")`<br/>`target_sibling = element.find_previous_sibling(\"a\")`   | `target_sibling = element.siblings.search(lambda s: s.tag == 'a')`                |\n| Searching for elements in the siblings of an element            | `target_sibling = element.find_next_siblings(\"a\")`<br/>`target_sibling = element.find_previous_siblings(\"a\")` | `target_sibling = element.siblings.filter(lambda s: s.tag == 'a')`                |\n| Searching for an element in the next elements of an element     | `target_parent = element.find_next(\"a\")`                                                                      | `target_parent = element.below_elements.search(lambda p: p.tag == 'a')`           |\n| Searching for elements in the next elements of an element       | `target_parent = element.find_all_next(\"a\")`                                                                  | `target_parent = element.below_elements.filter(lambda p: p.tag == 'a')`           |\n| Searching for an element in the ancestors of an element         | `target_parent = element.find_previous(\"a\")` ¹                                                                | `target_parent = element.path.search(lambda p: p.tag == 'a')`                     |\n| Searching for elements in the ancestors of an element           | `target_parent = element.find_all_previous(\"a\")` ¹                                                            | `target_parent = element.path.filter(lambda p: p.tag == 'a')`                     |\n| Get previous sibling of an element                              | `prev_element = element.previous_sibling`                                                                     | `prev_element = element.previous`                                                 |\n| Navigating to children                                          | `children = list(element.children)`                                                                           | `children = element.children`                                                     |\n| Get all descendants of an element                               | `children = list(element.descendants)`                                                                        | `children = element.below_elements`                                               |\n| Filtering a group of elements that satisfies a condition        | `group = soup.find('p', 'story').css.filter('a')`                                                             | `group = page.find_all('p', 'story').filter(lambda p: p.tag == 'a')`              |\n\n\n¹ **Note:** BS4's `find_previous`/`find_all_previous` searches all preceding elements in document order, while Scrapling's `path` only returns ancestors (the parent chain). These are not exact equivalents, but ancestor search covers the most common use case.\n\nBeautifulSoup supports modifying/manipulating the parsed DOM. 
Scrapling does not — it is read-only and optimized for extraction.\n\n### Full Example: Extracting Links\n\n**With BeautifulSoup:**\n\n```python\nimport requests\nfrom bs4 import BeautifulSoup\n\nurl = 'https://example.com'\nresponse = requests.get(url)\nsoup = BeautifulSoup(response.text, 'html.parser')\n\nlinks = soup.find_all('a')\nfor link in links:\n    print(link['href'])\n```\n\n**With Scrapling:**\n\n```python\nfrom scrapling import Fetcher\n\nurl = 'https://example.com'\npage = Fetcher.get(url)\n\nlinks = page.css('a::attr(href)')\nfor link in links:\n    print(link)\n```\n\nScrapling combines fetching and parsing into a single step.\n\n**Note:**\n\n- **Parsers**: BeautifulSoup supports multiple parser engines. Scrapling always uses `lxml` for performance.\n- **Element Types**: BeautifulSoup elements are `Tag` objects; Scrapling elements are `Selector` objects. Both provide similar navigation and extraction methods.\n- **Error Handling**: Both libraries return `None` when an element is not found (e.g., `soup.find()` or `page.find()`). `page.css()` returns an empty `Selectors` list when no elements match. Use `page.css('.foo').first` to safely get the first match or `None`.\n- **Text Extraction**: Scrapling's `TextHandler` provides additional text processing methods such as `clean()` for removing extra whitespace, consecutive spaces, or unwanted characters."
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/parsing/adaptive.md",
    "content": "# Adaptive scraping\n\nAdaptive scraping (previously known as automatch) is one of Scrapling's most powerful features. It allows your scraper to survive website changes by intelligently tracking and relocating elements.\n\nConsider a page with a structure like this:\n```html\n<div class=\"container\">\n    <section class=\"products\">\n        <article class=\"product\" id=\"p1\">\n            <h3>Product 1</h3>\n            <p class=\"description\">Description 1</p>\n        </article>\n        <article class=\"product\" id=\"p2\">\n            <h3>Product 2</h3>\n            <p class=\"description\">Description 2</p>\n        </article>\n    </section>\n</div>\n```\nTo scrape the first product (the one with the `p1` ID), a selector like this would be used:\n```python\npage.css('#p1')\n```\nWhen website owners implement structural changes like\n```html\n<div class=\"new-container\">\n    <div class=\"product-wrapper\">\n        <section class=\"products\">\n            <article class=\"product new-class\" data-id=\"p1\">\n                <div class=\"product-info\">\n                    <h3>Product 1</h3>\n                    <p class=\"new-description\">Description 1</p>\n                </div>\n            </article>\n            <article class=\"product new-class\" data-id=\"p2\">\n                <div class=\"product-info\">\n                    <h3>Product 2</h3>\n                    <p class=\"new-description\">Description 2</p>\n                </div>\n            </article>\n        </section>\n    </div>\n</div>\n```\nThe selector will no longer function, and your code needs maintenance. That's where Scrapling's `adaptive` feature comes into play.\n\nWith Scrapling, you can enable the `adaptive` feature the first time you select an element, and the next time you select that element and it doesn't exist, Scrapling will remember its properties and search on the website for the element with the highest percentage of similarity to that element.\n\n```python\nfrom scrapling import Selector, Fetcher\n# Before the change\npage = Selector(page_source, adaptive=True, url='example.com')\n# or\nFetcher.adaptive = True\npage = Fetcher.get('https://example.com')\n# then\nelement = page.css('#p1', auto_save=True)\nif not element:  # One day website changes?\n    element = page.css('#p1', adaptive=True)  # Scrapling still finds it!\n# the rest of your code...\n```\nIt works with all selection methods, not just CSS/XPath selection.\n\n## Real-World Scenario\nThis example uses [The Web Archive](https://archive.org/)'s [Wayback Machine](https://web.archive.org/) to demonstrate adaptive scraping across different versions of a website. 
A copy of [StackOverflow's website in 2010](https://web.archive.org/web/20100102003420/http://stackoverflow.com/) is compared against the current design to show that the adaptive feature can extract the same button using the same selector.\n\nTo extract the Questions button from the old design, a selector like `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a` can be used (this specific selector was generated by Chrome).\n\nTesting the same selector in both versions:\n```python\n>> from scrapling import Fetcher\n>> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'\n>> old_url = \"https://web.archive.org/web/20100102003420/http://stackoverflow.com/\"\n>> new_url = \"https://stackoverflow.com/\"\n>> Fetcher.configure(adaptive = True, adaptive_domain='stackoverflow.com')\n>> \n>> page = Fetcher.get(old_url, timeout=30)\n>> element1 = page.css(selector, auto_save=True)[0]\n>> \n>> # Same selector but used in the updated website\n>> page = Fetcher.get(new_url)\n>> element2 = page.css(selector, adaptive=True)[0]\n>> \n>> if element1.text == element2.text:\n...    print('Scrapling found the same element in the old and new designs!')\n'Scrapling found the same element in the old and new designs!'\n```\nThe `adaptive_domain` argument is used here because Scrapling sees `archive.org` and `stackoverflow.com` as two different domains and would isolate their `adaptive` data. Passing `adaptive_domain` tells Scrapling to treat them as the same website for adaptive data storage.\n\nIn a typical scenario with the same URL for both requests, the `adaptive_domain` argument is not needed. The adaptive logic works the same way with both the `Selector` and `Fetcher` classes.\n\n**Note:** The main reason for creating the `adaptive_domain` argument was to handle if the website changed its URL while changing the design/structure. In that case, it can be used to continue using the previously stored adaptive data for the new URL. Otherwise, Scrapling will consider it a new website and discard the old data.\n\n## How the adaptive scraping feature works\nAdaptive scraping works in two phases:\n\n1. **Save Phase**: Store unique properties of elements\n2. **Match Phase**: Find elements with similar properties later\n\nAfter selecting an element through any method, the library can find it the next time the website is scraped, even if it undergoes structural/design changes.\n\nThe general logic is as follows:\n\n  1. Scrapling saves that element's unique properties (methods shown below).\n  2. Scrapling uses its configured database (SQLite by default) and saves each element's unique properties.\n  3. Because everything about the element can be changed or removed by the website's owner(s), nothing from the element can be used as a unique identifier for the database. The storage system relies on two things:\n     1. The domain of the current website. When using the `Selector` class, pass it when initializing; when using a fetcher, the domain is automatically taken from the URL.\n     2. An `identifier` to query that element's properties from the database. The identifier does not always need to be set manually (see below).\n\n     Together, they will later be used to retrieve the element's unique properties from the database.\n\n  4. Later, when the website's structure changes, enabling `adaptive` causes Scrapling to retrieve the element's unique properties and match all elements on the page against them. A score is calculated based on their similarity to the desired element. 
Everything is taken into consideration in that comparison.\n  5. The element(s) with the highest similarity score to the wanted element are returned.\n\n### The unique properties\nThe unique properties Scrapling relies on are:\n\n- Element tag name, text, attributes (names and values), siblings (tag names only), and path (tag names only).\n- Element's parent tag name, attributes (names and values), and text.\n\nThe comparison between elements is not exact; it is based on how similar these values are. Everything is considered, including the values' order (e.g., the order in which class names are written).\n\n## How to use adaptive feature\nThe adaptive feature can be applied to any found element and is added as arguments to CSS/XPath selection methods.\n\nFirst, enable the `adaptive` feature by passing `adaptive=True` to the [Selector](main_classes.md#selector) class when initializing it, or enable it on the fetcher being used.\n\nExamples:\n```python\n>>> from scrapling import Selector, Fetcher\n>>> page = Selector(html_doc, adaptive=True)\n# OR\n>>> Fetcher.adaptive = True\n>>> page = Fetcher.get('https://example.com')\n```\nWhen using the [Selector](main_classes.md#selector) class, pass the URL of the website with the `url` argument so Scrapling can separate the properties saved for each element by domain.\n\nIf no URL is passed, the word `default` will be used in place of the URL field while saving the element's unique properties. This is only an issue when using the same identifier for a different website without passing the URL parameter. The save process overwrites previous data, and the `adaptive` feature uses only the latest saved properties.\n\nThe `storage` and `storage_args` arguments control the database connection; by default, the SQLite class provided by the library is used.\n\nThere are two main ways to use the `adaptive` feature:\n\n### The CSS/XPath Selection way\nFirst, use the `auto_save` argument while selecting an element that exists on the page:\n```python\nelement = page.css('#p1', auto_save=True)\n```\nWhen the element no longer exists, use the same selector with the `adaptive` argument to have the library find it:\n```python\nelement = page.css('#p1', adaptive=True)\n```\nWith the `css`/`xpath` methods, the identifier is set automatically to the selector string passed to the method.\n\nAdditionally, for all these methods, you can pass the `identifier` argument to set it yourself. This is useful in some instances, or you can use it to save properties with the `auto_save` argument.\n\n### The manual way\nElements can be manually saved, retrieved, and relocated within the `adaptive` feature. This allows relocating any element found by any method.\n\nExample of getting an element by text:\n```python\n>>> element = page.find_by_text('Tipping the Velvet', first_match=True)\n```\nSave its unique properties using the `save` method. The identifier must be set manually (use a meaningful identifier):\n```python\n>>> page.save(element, 'my_special_element')\n```\nLater, retrieve and relocate the element inside the page with `adaptive`:\n```python\n>>> element_dict = page.retrieve('my_special_element')\n>>> page.relocate(element_dict, selector_type=True)\n[<data='<a href=\"catalogue/tipping-the-velvet_99...' 
parent='<h3><a href=\"catalogue/tipping-the-velve...'>]\n>>> page.relocate(element_dict, selector_type=True).css('::text').getall()\n['Tipping the Velvet']\n```\nThe `retrieve` and `relocate` methods are used here.\n\nTo keep it as a `lxml.etree` object, omit the `selector_type` argument:\n```python\n>>> page.relocate(element_dict)\n[<Element a at 0x105a2a7b0>]\n```\n\n## Troubleshooting\n\n### No Matches Found\n```python\n# 1. Check if data was saved\nelement_data = page.retrieve('identifier')\nif not element_data:\n    print(\"No data saved for this identifier\")\n\n# 2. Try with different identifier\nproducts = page.css('.product', adaptive=True, identifier='old_selector')\n\n# 3. Save again with new identifier\nproducts = page.css('.new-product', auto_save=True, identifier='new_identifier')\n```\n\n### Wrong Elements Matched\n```python\n# Use more specific selectors\nproducts = page.css('.product-list .product', auto_save=True)\n\n# Or save with more context\nproduct = page.find_by_text('Product Name').parent\npage.save(product, 'specific_product')\n```\n\n## Known Issues\nIn the `adaptive` save process, only the unique properties of the first element in the selection results are saved. So if the selector you are using selects different elements on the page in other locations, `adaptive` will return the first element to you only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector, for example), as these selectors are separated and each is executed alone.\n\n"
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/parsing/main_classes.md",
    "content": "# Parsing main classes\n\nThe [Selector](#selector) class is the core parsing engine in Scrapling, providing HTML parsing and element selection capabilities. You can always import it with any of the following imports\n```python\nfrom scrapling import Selector\nfrom scrapling.parser import Selector\n```\nUsage:\n```python\npage = Selector(\n    '<html>...</html>',\n    url='https://example.com'\n)\n\n# Then select elements as you like\nelements = page.css('.product')\n```\nIn Scrapling, the main object you deal with after passing an HTML source or fetching a website is, of course, a [Selector](#selector) object. Any operation you do, like selection, navigation, etc., will return either a [Selector](#selector) object or a [Selectors](#selectors) object, given that the result is element/elements from the page, not text or similar.\n\nThe main page is a [Selector](#selector) object, and the elements within are [Selector](#selector) objects. Any text (text content inside elements or attribute values) is a [TextHandler](#texthandler) object, and element attributes are stored as [AttributesHandler](#attributeshandler).\n\n## Selector\n### Arguments explained\nThe most important one is `content`, it's used to pass the HTML code you want to parse, and it accepts the HTML content as `str` or `bytes`.\n\nThe arguments `url`, `adaptive`, `storage`, and `storage_args` are settings used with the `adaptive` feature. They are explained in the [adaptive](adaptive.md) feature page.\n\nArguments for parsing adjustments:\n\n- **encoding**: This is the encoding that will be used while parsing the HTML. The default is `UTF-8`.\n- **keep_comments**: This tells the library whether to keep HTML comments while parsing the page. It's disabled by default because it can cause issues with your scraping in various ways.\n- **keep_cdata**: Same logic as the HTML comments. 
[cdata](https://stackoverflow.com/questions/7092236/what-is-cdata-in-html) is removed by default for cleaner HTML.\n\nThe arguments `huge_tree` and `root` are advanced features not covered here.\n\nMost properties on the main page and its elements are lazily loaded (not initialized until accessed), which contributes to Scrapling's speed.\n\n### Properties\nProperties for traversal are separated in the [traversal](#traversal) section below.\n\nParsing this HTML page as an example:\n```html\n<html>\n  <head>\n    <title>Some page</title>\n  </head>\n  <body>\n    <div class=\"product-list\">\n      <article class=\"product\" data-id=\"1\">\n        <h3>Product 1</h3>\n        <p class=\"description\">This is product 1</p>\n        <span class=\"price\">$10.99</span>\n        <div class=\"hidden stock\">In stock: 5</div>\n      </article>\n    \n      <article class=\"product\" data-id=\"2\">\n        <h3>Product 2</h3>\n        <p class=\"description\">This is product 2</p>\n        <span class=\"price\">$20.99</span>\n        <div class=\"hidden stock\">In stock: 3</div>\n      </article>\n    \n      <article class=\"product\" data-id=\"3\">\n        <h3>Product 3</h3>\n        <p class=\"description\">This is product 3</p>\n        <span class=\"price\">$15.99</span>\n        <div class=\"hidden stock\">Out of stock</div>\n      </article>\n    </div>\n\n    <script id=\"page-data\" type=\"application/json\">\n      {\n        \"lastUpdated\": \"2024-09-22T10:30:00Z\",\n        \"totalProducts\": 3\n      }\n    </script>\n  </body>\n</html>\n```\nLoad the page directly as shown before:\n```python\nfrom scrapling import Selector\npage = Selector(html_doc)\n```\nGet all text content on the page recursively\n```python\n>>> page.get_all_text()\n'Some page\\n\\n    \\n\\n      \\nProduct 1\\nThis is product 1\\n$10.99\\nIn stock: 5\\nProduct 2\\nThis is product 2\\n$20.99\\nIn stock: 3\\nProduct 3\\nThis is product 3\\n$15.99\\nOut of stock'\n```\nGet the first article (used as an example throughout):\n```python\narticle = page.find('article')\n```\nWith the same logic, get all text content on the element recursively\n```python\n>>> article.get_all_text()\n'Product 1\\nThis is product 1\\n$10.99\\nIn stock: 5'\n```\nBut if you try to get the direct text content, it will be empty because it doesn't have direct text in the HTML code above\n```python\n>>> article.text\n''\n```\nThe `get_all_text` method has the following optional arguments:\n\n1. **separator**: All strings collected will be concatenated using this separator. The default is '\\n'.\n2. **strip**: If enabled, strings will be stripped before concatenation. Disabled by default.\n3. **ignore_tags**: A tuple of all tag names you want to ignore in the final results and ignore any elements nested within them. The default is `('script', 'style',)`.\n4. **valid_values**: If enabled, the method will only collect elements with real values, so all elements with empty text content or only whitespaces will be ignored. It's enabled by default\n\nThe text returned is a [TextHandler](#texthandler), not a standard string. 
If the text content can be serialized to JSON, use `.json()` on it:\n```python\n>>> script = page.find('script')\n>>> script.json()\n{'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}\n```\nLet's continue to get the element tag\n```python\n>>> article.tag\n'article'\n```\nUsing it on the page directly operates on the root `html` element:\n```python\n>>> page.tag\n'html'\n```\nGetting the attributes of the element\n```python\n>>> print(article.attrib)\n{'class': 'product', 'data-id': '1'}\n```\nAccess a specific attribute with any of the following\n```python\n>>> article.attrib['class']\n>>> article.attrib.get('class')\n>>> article['class']  # new in v0.3\n```\nCheck if the attributes contain a specific attribute with any of the methods below\n```python\n>>> 'class' in article.attrib\n>>> 'class' in article  # new in v0.3\n```\nGet the HTML content of the element\n```python\n>>> article.html_content\n'<article class=\"product\" data-id=\"1\"><h3>Product 1</h3>\\n        <p class=\"description\">This is product 1</p>\\n        <span class=\"price\">$10.99</span>\\n        <div class=\"hidden stock\">In stock: 5</div>\\n      </article>'\n```\nGet the prettified version of the element's HTML content\n```python\nprint(article.prettify())\n```\n```html\n<article class=\"product\" data-id=\"1\"><h3>Product 1</h3>\n    <p class=\"description\">This is product 1</p>\n    <span class=\"price\">$10.99</span>\n    <div class=\"hidden stock\">In stock: 5</div>\n</article>\n```\nUse the `.body` property to get the raw content of the page. Starting from v0.4, when used on a `Response` object from fetchers, `.body` always returns `bytes`.\n```python\n>>> page.body\n'<html>\\n  <head>\\n    <title>Some page</title>\\n  </head>\\n  ...'\n```\nTo get all the ancestors in the DOM tree of this element\n```python\n>>> article.path\n[<data='<div class=\"product-list\"> <article clas...' parent='<body> <div class=\"product-list\"> <artic...'>,\n <data='<body> <div class=\"product-list\"> <artic...' parent='<html><head><title>Some page</title></he...'>,\n <data='<html><head><title>Some page</title></he...'>]\n```\nGenerate a CSS shortened selector if possible, or generate the full selector\n```python\n>>> article.generate_css_selector\n'body > div > article'\n>>> article.generate_full_css_selector\n'body > div > article'\n```\nSame case with XPath\n```python\n>>> article.generate_xpath_selector\n\"//body/div/article\"\n>>> article.generate_full_xpath_selector\n\"//body/div/article\"\n```\n\n### Traversal\nProperties and methods for navigating elements on the page.\n\nThe `html` element is the root of the website's tree. Elements like `head` and `body` are \"children\" of `html`, and `html` is their \"parent\". The element `body` is a \"sibling\" of `head` and vice versa.\n\nAccessing the parent of an element\n```python\n>>> article.parent\n<data='<div class=\"product-list\"> <article clas...' parent='<body> <div class=\"product-list\"> <artic...'>\n>>> article.parent.tag\n'div'\n```\nChaining is supported, as with all similar properties/methods:\n```python\n>>> article.parent.parent.tag\n'body'\n```\nGet the children of an element\n```python\n>>> article.children\n[<data='<h3>Product 1</h3>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<p class=\"description\">This is product 1...' 
parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<span class=\"price\">$10.99</span>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<div class=\"hidden stock\">In stock: 5</d...' parent='<article class=\"product\" data-id=\"1\"><h3...'>]\n```\nGet all elements underneath an element. It acts as a nested version of the `children` property\n```python\n>>> article.below_elements\n[<data='<h3>Product 1</h3>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<p class=\"description\">This is product 1...' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<span class=\"price\">$10.99</span>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<div class=\"hidden stock\">In stock: 5</d...' parent='<article class=\"product\" data-id=\"1\"><h3...'>]\n```\nThis element returns the same result as the `children` property because its children don't have children.\n\nAnother example of using the element with the `product-list` class will clear the difference between the `children` property and the `below_elements` property\n```python\n>>> products_list = page.css('.product-list')[0]\n>>> products_list.children\n[<data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n <data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n <data='<article class=\"product\" data-id=\"3\"><h3...' parent='<div class=\"product-list\"> <article clas...'>]\n\n>>> products_list.below_elements\n[<data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n <data='<h3>Product 1</h3>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<p class=\"description\">This is product 1...' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<span class=\"price\">$10.99</span>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<div class=\"hidden stock\">In stock: 5</d...' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n...]\n```\nGet the siblings of an element\n```python\n>>> article.siblings\n[<data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n <data='<article class=\"product\" data-id=\"3\"><h3...' parent='<div class=\"product-list\"> <article clas...'>]\n```\nGet the next element of the current element\n```python\n>>> article.next\n<data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>\n```\nThe same logic applies to the `previous` property\n```python\n>>> article.previous  # It's the first child, so it doesn't have a previous element\n>>> second_article = page.css('.product[data-id=\"2\"]')[0]\n>>> second_article.previous\n<data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>\n```\nCheck if an element has a specific class name:\n```python\n>>> article.has_class('product')\nTrue\n```\nIterate over the entire ancestors' tree of any element:\n```python\nfor ancestor in article.iterancestors():\n    # do something with it...\n```\nSearch for a specific ancestor that satisfies a search function. 
Pass a function that takes a [Selector](#selector) object as an argument and returns `True`/`False`:\n```python\n>>> article.find_ancestor(lambda ancestor: ancestor.has_class('product-list'))\n<data='<div class=\"product-list\"> <article clas...' parent='<body> <div class=\"product-list\"> <artic...'>\n\n>>> article.find_ancestor(lambda ancestor: ancestor.css('.product-list'))  # Same result, different approach\n<data='<div class=\"product-list\"> <article clas...' parent='<body> <div class=\"product-list\"> <artic...'>\n```\n## Selectors\nThe class `Selectors` is the \"List\" version of the [Selector](#selector) class. It inherits from the Python standard `List` type, so it shares all `List` properties and methods while adding more methods to make the operations you want to execute on the [Selector](#selector) instances within more straightforward.\n\nIn the [Selector](#selector) class, all methods/properties that should return a group of elements return them as a [Selectors](#selectors) class instance.\n\nStarting with v0.4, all selection methods consistently return [Selector](#selector)/[Selectors](#selectors) objects, even for text nodes and attribute values. Text nodes (selected via `::text`, `/text()`, `::attr()`, `/@attr`) are wrapped in [Selector](#selector) objects. These text node selectors have `tag` set to `\"#text\"`, and their `text` property returns the text value. You can still access the text value directly, and all other properties return empty/default values gracefully.\n\n```python\n>>> page.css('a::text')              # -> Selectors (of text node Selectors)\n>>> page.xpath('//a/text()')         # -> Selectors\n>>> page.css('a::text').get()        # -> TextHandler (the first text value)\n>>> page.css('a::text').getall()     # -> TextHandlers (all text values)\n>>> page.css('a::attr(href)')        # -> Selectors\n>>> page.xpath('//a/@href')          # -> Selectors\n>>> page.css('.price_color')         # -> Selectors\n```\n\n### Data extraction methods\nStarting with v0.4, [Selector](#selector) and [Selectors](#selectors) both provide `get()`, `getall()`, and their aliases `extract_first` and `extract` (following Scrapy conventions). The old `get_all()` method has been removed.\n\n**On a [Selector](#selector) object:**\n\n- `get()` returns a `TextHandler` — for text node selectors, it returns the text value; for HTML element selectors, it returns the serialized outer HTML.\n- `getall()` returns a `TextHandlers` list containing the single serialized string.\n- `extract_first` is an alias for `get()`, and `extract` is an alias for `getall()`.\n\n```python\n>>> page.css('h3')[0].get()        # Outer HTML of the element\n'<h3>Product 1</h3>'\n\n>>> page.css('h3::text')[0].get()  # Text value of the text node\n'Product 1'\n```\n\n**On a [Selectors](#selectors) object:**\n\n- `get(default=None)` returns the serialized string of the **first** element, or `default` if the list is empty.\n- `getall()` serializes **all** elements and returns a `TextHandlers` list.\n- `extract_first` is an alias for `get()`, and `extract` is an alias for `getall()`.\n\n```python\n>>> page.css('.price::text').get()      # First price text\n'$10.99'\n\n>>> page.css('.price::text').getall()   # All price texts\n['$10.99', '$20.99', '$15.99']\n\n>>> page.css('.price::text').get('')    # With default value\n'$10.99'\n```\n\nThese methods work seamlessly with all selection types (CSS, XPath, `find`, etc.) 
and are the recommended way to extract text and attribute values in a Scrapy-compatible style.\n\n### Properties\nApart from the standard operations on Python lists (iteration, slicing, etc.), the following operations are available:\n\nCSS and XPath selectors can be executed directly on the [Selector](#selector) instances, with the same return types as [Selector](#selector)'s `css` and `xpath` methods. The arguments are similar, except the `adaptive` argument is not available. This makes chaining methods straightforward:\n```python\n>>> page.css('.product_pod a')\n[<data='<a href=\"catalogue/a-light-in-the-attic_...' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/a-light-in-the-attic_...' parent='<h3><a href=\"catalogue/a-light-in-the-at...'>,\n <data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<h3><a href=\"catalogue/tipping-the-velve...'>,\n <data='<a href=\"catalogue/soumission_998/index....' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/soumission_998/index....' parent='<h3><a href=\"catalogue/soumission_998/in...'>,\n...]\n\n>>> page.css('.product_pod').css('a')  # Returns the same result\n[<data='<a href=\"catalogue/a-light-in-the-attic_...' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/a-light-in-the-attic_...' parent='<h3><a href=\"catalogue/a-light-in-the-at...'>,\n <data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<h3><a href=\"catalogue/tipping-the-velve...'>,\n <data='<a href=\"catalogue/soumission_998/index....' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/soumission_998/index....' parent='<h3><a href=\"catalogue/soumission_998/in...'>,\n...]\n```\nThe `re` and `re_first` methods can be run directly. They take the same arguments as the [Selector](#selector) class. In this class, `re_first` runs `re` on each [Selector](#selector) within and returns the first one with a result. The `re` method returns a [TextHandlers](#texthandlers) object combining all matches:\n```python\n>>> page.css('.price_color').re(r'[\\d\\.]+')\n['51.77',\n '53.74',\n '50.10',\n '47.82',\n '54.23',\n...]\n\n>>> page.css('.product_pod h3 a::attr(href)').re(r'catalogue/(.*)/index.html')\n['a-light-in-the-attic_1000',\n 'tipping-the-velvet_999',\n 'soumission_998',\n 'sharp-objects_997',\n...]\n```\nThe `search` method searches the available [Selector](#selector) instances. The function passed must accept a [Selector](#selector) instance as the first argument and return True/False. Returns the first matching [Selector](#selector) instance, or `None`:\n```python\n# Find all the products with price '53.23'.\n>>> search_function = lambda p: float(p.css('.price_color').re_first(r'[\\d\\.]+')) == 54.23\n>>> page.css('.product_pod').search(search_function)\n<data='<article class=\"product_pod\"><div class=...' 
parent='<li class=\"col-xs-6 col-sm-4 col-md-3 co...'>\n```\nThe `filter` method takes a function like `search` but returns a `Selectors` instance of all matching [Selector](#selector) instances:\n```python\n# Find all products with prices over $50\n>>> filtering_function = lambda p: float(p.css('.price_color').re_first(r'[\\d\\.]+')) > 50\n>>> page.css('.product_pod').filter(filtering_function)\n[<data='<article class=\"product_pod\"><div class=...' parent='<li class=\"col-xs-6 col-sm-4 col-md-3 co...'>,\n <data='<article class=\"product_pod\"><div class=...' parent='<li class=\"col-xs-6 col-sm-4 col-md-3 co...'>,\n <data='<article class=\"product_pod\"><div class=...' parent='<li class=\"col-xs-6 col-sm-4 col-md-3 co...'>,\n...]\n```\nSafe access to the first or last element without index errors:\n```python\n>>> page.css('.product').first   # First Selector or None\n<data='<article class=\"product\" data-id=\"1\"><h3...'>\n>>> page.css('.product').last    # Last Selector or None\n<data='<article class=\"product\" data-id=\"3\"><h3...'>\n>>> page.css('.nonexistent').first  # Returns None instead of raising IndexError\n```\n\nGet the number of [Selector](#selector) instances in a [Selectors](#selectors) instance:\n```python\npage.css('.product_pod').length\n```\nwhich is equivalent to\n```python\nlen(page.css('.product_pod'))\n```\n\n## TextHandler\nAll methods/properties that return a string return `TextHandler`, and those that return a list of strings return [TextHandlers](#texthandlers) instead.\n\nTextHandler is a subclass of the standard Python string, so all standard string operations are supported.\n\nTextHandler provides extra methods and properties beyond standard Python strings. All methods and properties in all classes that return string(s) return TextHandler, enabling chaining and cleaner code. It can also be imported directly and used on any string.\n### Usage\nAll operations (slicing, indexing, etc.) and methods (`split`, `replace`, `strip`, etc.) return a `TextHandler`, so they can be chained.\n\nThe `re` and `re_first` methods exist in [Selector](#selector), [Selectors](#selectors), and [TextHandlers](#texthandlers) as well, accepting the same arguments.\n\n- The `re` method takes a string/compiled regex pattern as the first argument. It searches the data for all strings matching the regex and returns them as a [TextHandlers](#texthandlers) instance. The `re_first` method takes the same arguments but returns only the first result as a `TextHandler` instance.\n    \n    Also, it takes other helpful arguments, which are:\n    \n    - **replace_entities**: This is enabled by default. It replaces character entity references with their corresponding characters.\n    - **clean_match**: It's disabled by default. This causes the method to ignore all whitespace, including consecutive spaces, while matching.\n    - **case_sensitive**: It's enabled by default. 
As the name implies, disabling it causes the regex to ignore letter case during compilation.\n  \n    The return result is [TextHandlers](#texthandlers) because the `re` method is used:\n    ```python\n    >>> page.css('.price_color').re(r'[\\d\\.]+')\n    ['51.77',\n     '53.74',\n     '50.10',\n     '47.82',\n     '54.23',\n    ...]\n    \n    >>> page.css('.product_pod h3 a::attr(href)').re(r'catalogue/(.*)/index.html')\n    ['a-light-in-the-attic_1000',\n     'tipping-the-velvet_999',\n     'soumission_998',\n     'sharp-objects_997',\n    ...]\n    ```\n    Examples with custom strings demonstrating the other arguments:\n    ```python\n    >>> from scrapling import TextHandler\n    >>> test_string = TextHandler('hi  there')  # Hence the two spaces\n    >>> test_string.re('hi there')\n    >>> test_string.re('hi there', clean_match=True)  # Using `clean_match` will clean the string before matching the regex\n    ['hi there']\n    \n    >>> test_string2 = TextHandler('Oh, Hi Mark')\n    >>> test_string2.re_first('oh, hi Mark')\n    >>> test_string2.re_first('oh, hi Mark', case_sensitive=False)  # Hence disabling `case_sensitive`\n    'Oh, Hi Mark'\n    \n    # Mixing arguments\n    >>> test_string.re('hi there', clean_match=True, case_sensitive=False)\n    ['hi There']\n    ```\n    Since `html_content` returns `TextHandler`, regex can be applied directly on HTML content:\n    ```python\n    >>> page.html_content.re('div class=\".*\">(.*)</div')\n    ['In stock: 5', 'In stock: 3', 'Out of stock']\n    ```\n\n- The `.json()` method converts the content to a JSON object if possible; otherwise, it throws an error:\n  ```python\n  >>> page.css('#page-data::text').get()\n    '\\n      {\\n        \"lastUpdated\": \"2024-09-22T10:30:00Z\",\\n        \"totalProducts\": 3\\n      }\\n    '\n  >>> page.css('#page-data::text').get().json()\n    {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}\n  ```\n  If no text node is specified while selecting an element, the text content is selected automatically:\n  ```python\n  >>> page.css('#page-data')[0].json()\n  {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}\n  ```\n  The [Selector](#selector) class adds additional behavior. Given this page:\n  ```html\n  <html>\n      <body>\n          <div>\n            <script id=\"page-data\" type=\"application/json\">\n              {\n                \"lastUpdated\": \"2024-09-22T10:30:00Z\",\n                \"totalProducts\": 3\n              }\n            </script>\n          </div>\n      </body>\n  </html>\n  ```\n  The [Selector](#selector) class has the `get_all_text` method, which returns a `TextHandler`. For example:\n  ```python\n  >>> page.css('div::text').get().json()\n  ```\n  This throws an error because the `div` tag has no direct text content. The `get_all_text` method handles this case:\n  ```python\n  >>> page.css('div')[0].get_all_text(ignore_tags=[]).json()\n    {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}\n  ```\n  The `ignore_tags` argument is used here because its default value is `('script', 'style',)`.\n\n  When dealing with a JSON response:\n  ```python\n  >>> page = Selector(\"\"\"{\"some_key\": \"some_value\"}\"\"\")\n  ```\n  The [Selector](#selector) class is optimized for HTML, so it treats this as a broken HTML response and wraps it. 
The `html_content` property shows:\n  ```python\n  >>> page.html_content\n  '<html><body><p>{\"some_key\": \"some_value\"}</p></body></html>'\n  ```\n  The `json` method can be used directly:\n  ```python\n  >>> page.json()\n  {'some_key': 'some_value'}\n  ```\n  For JSON responses, the [Selector](#selector) class keeps a raw copy of the content it receives. When `.json()` is called, it checks for that raw copy first and converts it to JSON. If the raw copy is unavailable (as with sub-elements), it checks the current element's text content, then falls back to `get_all_text`.\n\n- The `.clean()` method removes all whitespace and consecutive spaces, returning a new `TextHandler` instance:\n```python\n>>> TextHandler('\\n wonderful  idea, \\reh?').clean()\n'wonderful idea, eh?'\n```\nThe `remove_entities` argument causes `clean` to replace HTML entities with their corresponding characters.\n\n- The `.sort()` method sorts the string characters:\n```python\n>>> TextHandler('acb').sort()\n'abc'\n```\nOr do it in reverse:\n```python\n>>> TextHandler('acb').sort(reverse=True)\n'cba'\n```\n\nThis class is returned in place of strings nearly everywhere in the library.\n\n## TextHandlers\nThis class inherits from standard lists, adding `re` and `re_first` as new methods.\n\nThe `re_first` method runs `re` on each [TextHandler](#texthandler) and returns the first result, or `None`.\n\n## AttributesHandler\nThis is a read-only version of Python's standard dictionary, or `dict`, used solely to store the attributes of each element/[Selector](#selector) instance.\n```python\n>>> print(page.find('script').attrib)\n{'id': 'page-data', 'type': 'application/json'}\n>>> type(page.find('script').attrib).__name__\n'AttributesHandler'\n```\nBecause it's read-only, it will use fewer resources than the standard dictionary. Still, it has the same dictionary method and properties, except those that allow you to modify/override the data.\n\nIt currently adds two extra simple methods:\n\n- The `search_values` method\n\n    Searches the current attributes by values (rather than keys) and returns a dictionary of each matching item.\n    \n    A simple example would be\n    ```python\n    >>> for i in page.find('script').attrib.search_values('page-data'):\n            print(i)\n    {'id': 'page-data'}\n    ```\n    But this method provides the `partial` argument as well, which allows you to search by part of the value:\n    ```python\n    >>> for i in page.find('script').attrib.search_values('page', partial=True):\n            print(i)\n    {'id': 'page-data'}\n    ```\n    A more practical example is using it with `find_all` to find all elements that have a specific value in their attributes:\n    ```python\n    >>> page.find_all(lambda element: list(element.attrib.search_values('product')))\n    [<data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n     <data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n     <data='<article class=\"product\" data-id=\"3\"><h3...' 
parent='<div class=\"product-list\"> <article clas...'>]\n    ```\n    All these elements have 'product' as the value for the `class` attribute.\n    \n    The `list` function is used here because `search_values` returns a generator, and a generator object is always truthy, so without converting it to a list, the filter would match every element.\n\n- The `json_string` property\n\n    This property converts the current attributes to a JSON string if the attributes are JSON serializable; otherwise, it throws an error.\n  \n    ```python\n    >>> page.find('script').attrib.json_string\n    b'{\"id\":\"page-data\",\"type\":\"application/json\"}'\n    ```"
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/parsing/selection.md",
    "content": "# Querying elements\nScrapling currently supports parsing HTML pages exclusively (no XML feeds), because the adaptive feature does not work with XML.\n\nIn Scrapling, there are five main ways to find elements:\n\n1. CSS3 Selectors\n2. XPath Selectors\n3. Finding elements based on filters/conditions.\n4. Finding elements whose content contains a specific text\n5. Finding elements whose content matches a specific regex\n\nThere are also other indirect ways to find elements. Scrapling can also find elements similar to a given element; see [Finding Similar Elements](#finding-similar-elements).\n\n## CSS/XPath selectors\n\n### What are CSS selectors?\n[CSS](https://en.wikipedia.org/wiki/CSS) is a language for applying styles to HTML documents. It defines selectors to associate those styles with specific HTML elements.\n\nScrapling implements CSS3 selectors as described in the [W3C specification](http://www.w3.org/TR/2011/REC-css3-selectors-20110929/). CSS selectors support comes from `cssselect`, so it's better to read about which [selectors are supported from cssselect](https://cssselect.readthedocs.io/en/latest/#supported-selectors) and pseudo-functions/elements.\n\nAlso, Scrapling implements some non-standard pseudo-elements like:\n\n* To select text nodes, use ``::text``.\n* To select attribute values, use ``::attr(name)`` where name is the name of the attribute that you want the value of\n\nThe selector logic follows the same conventions as Scrapy/Parsel.\n\nTo select elements with CSS selectors, use the `css` method, which returns `Selectors`. Use `[0]` to get the first element, or `.get()` / `.getall()` to extract text values from text/attribute pseudo-selectors.\n\n### What are XPath selectors?\n[XPath](https://en.wikipedia.org/wiki/XPath) is a language for selecting nodes in XML documents, which can also be used with HTML. This [cheatsheet](https://devhints.io/xpath) is a good resource for learning about [XPath](https://en.wikipedia.org/wiki/XPath). Scrapling adds XPath selectors directly through [lxml](https://lxml.de/).\n\nThe logic follows the same conventions as Scrapy/Parsel. However, Scrapling does not implement the XPath extension function `has-class` as Scrapy/Parsel does. Instead, it provides the `has_class` method on returned elements.\n\nTo select elements with XPath selectors, use the `xpath` method, which follows the same logic as the CSS selectors method above.\n\n> Note that each method of `css` and `xpath` has additional arguments, but we didn't explain them here, as they are all about the adaptive feature. 
The adaptive feature will have its own page later to be described in detail.\n\n### Selectors examples\nLet's see some shared examples of using CSS and XPath Selectors.\n\nSelect all elements with the class `product`.\n```python\nproducts = page.css('.product')\nproducts = page.xpath('//*[@class=\"product\"]')\n```\n**Note:** The XPath version won't be accurate if there's another class; it's always better to rely on CSS for selecting by class.\n\nSelect the first element with the class `product`.\n```python\nproduct = page.css('.product')[0]\nproduct = page.xpath('//*[@class=\"product\"]')[0]\n```\nGet the text of the first element with the `h1` tag name\n```python\ntitle = page.css('h1::text').get()\ntitle = page.xpath('//h1//text()').get()\n```\nWhich is the same as doing\n```python\ntitle = page.css('h1')[0].text\ntitle = page.xpath('//h1')[0].text\n```\nGet the `href` attribute of the first element with the `a` tag name\n```python\nlink = page.css('a::attr(href)').get()\nlink = page.xpath('//a/@href').get()\n```\nSelect the text of the first element with the `h1` tag name, which contains `Phone`, and under an element with class `product`.\n```python\ntitle = page.css('.product h1:contains(\"Phone\")::text').get()\ntitle = page.xpath('//*[@class=\"product\"]//h1[contains(text(),\"Phone\")]/text()').get()\n```\nYou can nest and chain selectors as you want, given that they return results\n```python\npage.css('.product')[0].css('h1:contains(\"Phone\")::text').get()\npage.xpath('//*[@class=\"product\"]')[0].xpath('//h1[contains(text(),\"Phone\")]/text()').get()\npage.xpath('//*[@class=\"product\"]')[0].css('h1:contains(\"Phone\")::text').get()\n```\nAnother example\n\nAll links that have 'image' in their 'href' attribute\n```python\nlinks = page.css('a[href*=\"image\"]')\nlinks = page.xpath('//a[contains(@href, \"image\")]')\nfor index, link in enumerate(links):\n    link_value = link.attrib['href']  # Cleaner than link.css('::attr(href)').get()\n    link_text = link.text\n    print(f'Link number {index} points to this url {link_value} with text content as \"{link_text}\"')\n```\n\n## Text-content selection\nScrapling provides two ways to select elements based on their direct text content:\n\n1. Elements whose direct text content contains the given text with many options through the `find_by_text` method.\n2. Elements whose direct text content matches the given regex pattern with many options through the `find_by_regex` method.\n\nAnything achievable with `find_by_text` can also be done with `find_by_regex`, but both are provided for convenience.\n\nWith `find_by_text`, you pass the text as the first argument; with `find_by_regex`, the regex pattern is the first argument. Both methods share the following arguments:\n\n* **first_match**: If `True` (the default), the method used will return the first result it finds.\n* **case_sensitive**: If `True`, the case of the letters will be considered.\n* **clean_match**: If `True`, all whitespaces and consecutive spaces will be replaced with a single space before matching.\n\nBy default, Scrapling searches for the exact matching of the text/pattern you pass to `find_by_text`, so the text content of the wanted element has to be ONLY the text you input, but that's why it also has one extra argument, which is:\n\n* **partial**: If enabled, `find_by_text` will return elements that contain the input text. 
So it's not an exact match anymore\n\n**Note:** The method `find_by_regex` can accept both regular strings and a compiled regex pattern as its first argument.\n\n### Finding Similar Elements\nScrapling can find elements similar to a given element, inspired by the AutoScraper library but usable with elements found by any method.\n\nGiven an element (e.g., a product found by title), calling `.find_similar()` on it causes Scrapling to:\n\n1. Find all page elements with the same DOM tree depth as this element. \n2. All found elements will be checked, and those without the same tag name, parent tag name, and grandparent tag name will be dropped.\n3. As a final check, Scrapling uses fuzzy matching to drop elements whose attributes don't resemble the original element's attributes. A configurable percentage controls this step (see arguments below).\n\nArguments for `find_similar()`:\n\n* **similarity_threshold**: The percentage for comparing elements' attributes (step 3). Default is 0.2 (tag attributes must be at least 20% similar). Set to 0 to disable this check entirely.\n* **ignore_attributes**: The attribute names passed will be ignored while matching the attributes in the last step. The default value is `('href', 'src',)` because URLs can change significantly across elements, making them unreliable.\n* **match_text**: If `True`, the element's text content will be considered when matching (Step 3). Using this argument in typical cases is not recommended, but it depends.\n\n### Examples\nExamples of finding elements with raw text, regex, and `find_similar`.\n```python\nfrom scrapling.fetchers import Fetcher\npage = Fetcher.get('https://books.toscrape.com/index.html')\n```\nFind the first element whose text fully matches this text\n```python\n>>> page.find_by_text('Tipping the Velvet')\n<data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<h3><a href=\"catalogue/tipping-the-velve...'>\n```\nCombining it with `page.urljoin` to return the full URL from the relative `href`.\n```python\n>>> page.find_by_text('Tipping the Velvet').attrib['href']\n'catalogue/tipping-the-velvet_999/index.html'\n>>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href'])\n'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'\n```\nGet all matches if there are more (notice it returns a list)\n```python\n>>> page.find_by_text('Tipping the Velvet', first_match=False)\n[<data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<h3><a href=\"catalogue/tipping-the-velve...'>]\n```\nGet all elements that contain the word `the` (Partial matching)\n```python\n>>> results = page.find_by_text('the', partial=True, first_match=False)\n>>> [i.text for i in results]\n['A Light in the ...',\n 'Tipping the Velvet',\n 'The Requiem Red',\n 'The Dirty Little Secrets ...',\n 'The Coming Woman: A ...',\n 'The Boys in the ...',\n 'The Black Maria',\n 'Mesaerion: The Best Science ...',\n \"It's Only the Himalayas\"]\n```\nThe search is case-insensitive by default, so those results include `The`, not just the lowercase `the`. 
To limit to exact case:\n```python\n>>> results = page.find_by_text('the', partial=True, first_match=False, case_sensitive=True)\n>>> [i.text for i in results]\n['A Light in the ...',\n 'Tipping the Velvet',\n 'The Boys in the ...',\n \"It's Only the Himalayas\"]\n```\nGet the first element whose text content matches my price regex\n```python\n>>> page.find_by_regex(r'£[\\d\\.]+')\n<data='<p class=\"price_color\">£51.77</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>\n>>> page.find_by_regex(r'£[\\d\\.]+').text\n'£51.77'\n```\nIt's the same if you pass the compiled regex as well; Scrapling will detect the input type and act upon that:\n```python\n>>> import re\n>>> regex = re.compile(r'£[\\d\\.]+')\n>>> page.find_by_regex(regex)\n<data='<p class=\"price_color\">£51.77</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>\n>>> page.find_by_regex(regex).text\n'£51.77'\n```\nGet all elements that match the regex\n```python\n>>> page.find_by_regex(r'£[\\d\\.]+', first_match=False)\n[<data='<p class=\"price_color\">£51.77</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>,\n <data='<p class=\"price_color\">£53.74</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>,\n <data='<p class=\"price_color\">£50.10</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>,\n <data='<p class=\"price_color\">£47.82</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>,\n ...]\n```\nAnd so on...\n\nFind all elements similar to the current element in location and attributes. For our case, ignore the 'title' attribute while matching\n```python\n>>> element = page.find_by_text('Tipping the Velvet')\n>>> element.find_similar(ignore_attributes=['title'])\n[<data='<a href=\"catalogue/a-light-in-the-attic_...' parent='<h3><a href=\"catalogue/a-light-in-the-at...'>,\n <data='<a href=\"catalogue/soumission_998/index....' parent='<h3><a href=\"catalogue/soumission_998/in...'>,\n <data='<a href=\"catalogue/sharp-objects_997/ind...' 
parent='<h3><a href=\"catalogue/sharp-objects_997...'>,\n...]\n```\nThe number of elements is 19, not 20, because the current element is not included in the results:\n```python\n>>> len(element.find_similar(ignore_attributes=['title']))\n19\n```\nGet the `href` attribute from all similar elements\n```python\n>>> [\n    element.attrib['href']\n    for element in element.find_similar(ignore_attributes=['title'])\n]\n['catalogue/a-light-in-the-attic_1000/index.html',\n 'catalogue/soumission_998/index.html',\n 'catalogue/sharp-objects_997/index.html',\n ...]\n```\nGetting all books' data using that element as a starting point:\n```python\n>>> for product in element.parent.parent.find_similar():\n        print({\n            \"name\": product.css('h3 a::text').get(),\n            \"price\": product.css('.price_color')[0].re_first(r'[\\d\\.]+'),\n            \"stock\": product.css('.availability::text').getall()[-1].clean()\n        })\n{'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}\n{'name': 'Soumission', 'price': '50.10', 'stock': 'In stock'}\n{'name': 'Sharp Objects', 'price': '47.82', 'stock': 'In stock'}\n...\n```\n### Advanced examples\nAdvanced examples using the `find_similar` method:\n\nE-commerce Product Extraction\n```python\ndef extract_product_grid(page):\n    # Find the first product card\n    first_product = page.find_by_text('Add to Cart').find_ancestor(\n        lambda e: e.has_class('product-card')\n    )\n\n    # Find similar product cards\n    products = first_product.find_similar()\n\n    return [\n        {\n            'name': p.css('h3::text').get(),\n            'price': p.css('.price::text').re_first(r'\\d+\\.\\d{2}'),\n            'stock': 'In stock' in p.text,\n            'rating': p.css('.rating')[0].attrib.get('data-rating')\n        }\n        for p in products\n    ]\n```\nTable Row Extraction\n```python\ndef extract_table_data(page):\n    # Find the first data row\n    first_row = page.css('table tbody tr')[0]\n\n    # Find similar rows\n    rows = first_row.find_similar()\n\n    return [\n        {\n            'column1': row.css('td:nth-child(1)::text').get(),\n            'column2': row.css('td:nth-child(2)::text').get(),\n            'column3': row.css('td:nth-child(3)::text').get()\n        }\n        for row in rows\n    ]\n```\nForm Field Extraction\n```python\ndef extract_form_fields(page):\n    # Find first form field container\n    first_field = page.css('input')[0].find_ancestor(\n        lambda e: e.has_class('form-field')\n    )\n\n    # Find similar field containers\n    fields = first_field.find_similar()\n\n    return [\n        {\n            'label': f.css('label::text').get(),\n            'type': f.css('input')[0].attrib.get('type'),\n            'required': 'required' in f.css('input')[0].attrib\n        }\n        for f in fields\n    ]\n```\nExtracting reviews from a website\n```python\ndef extract_reviews(page):\n    # Find first review\n    first_review = page.find_by_text('Great product!')\n    review_container = first_review.find_ancestor(\n        lambda e: e.has_class('review')\n    )\n    \n    # Find similar reviews\n    all_reviews = review_container.find_similar()\n    \n    return [\n        {\n            'text': r.css('.review-text::text').get(),\n            'rating': r.attrib.get('data-rating'),\n            'author': r.css('.reviewer::text').get()\n        }\n        for r in all_reviews\n    ]\n```\n## Filters-based searching\nInspired by BeautifulSoup's `find_all` function, elements can be found 
using the `find_all` and `find` methods. Both methods accept multiple filters; `find_all` returns all elements on the page that match all the filters, while `find` returns only the first match.\n\nTo be more specific:\n\n* Any string passed is considered a tag name.\n* Any iterable passed, like List/Tuple/Set, will be considered an iterable of tag names.\n* Any dictionary passed is considered a mapping of attribute names to attribute values.\n* Any regex patterns passed are used to filter elements by their text content, like the `find_by_regex` method.\n* Any functions passed are used to filter elements.\n* Any keyword argument passed is considered as an HTML element attribute with its value.\n\nIt collects all passed arguments and keywords, and each filter passes its results to the following filter in a waterfall-like filtering system.\n\nIt filters all elements in the current page/element in the following order:\n\n1. All elements with the passed tag name(s) get collected.\n2. All elements that match all passed attribute(s) are collected; if a previous filter is used, then previously collected elements are filtered.\n3. All elements that match all passed regex patterns are collected; if previous filter(s) are used, then previously collected elements are filtered.\n4. All elements that fulfill all passed function(s) are collected; if previous filter(s) are used, then previously collected elements are filtered.\n\n**Notes:**\n\n1. The filtering process always starts from the first filter it finds in the filtering order above. If no tag name(s) are passed but attributes are passed, the process starts from step 2, and so on.\n2. The order in which arguments are passed does not matter. The only order considered is the one explained above.\n\n### Examples\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> page = Fetcher.get('https://quotes.toscrape.com/')\n```\nFind all elements with the tag name `div`.\n```python\n>>> page.find_all('div')\n[<data='<div class=\"container\"> <div class=\"row...' parent='<body> <div class=\"container\"> <div clas...'>,\n <data='<div class=\"row header-box\"> <div class=...' parent='<div class=\"container\"> <div class=\"row...'>,\n...]\n```\nFind all div elements with a class that equals `quote`.\n```python\n>>> page.find_all('div', class_='quote')\n[<data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n <data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n...]\n```\nSame as above.\n```python\n>>> page.find_all('div', {'class': 'quote'})\n[<data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n <data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n...]\n```\nFind all elements with a class that equals `quote`.\n```python\n>>> page.find_all({'class': 'quote'})\n[<data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n <data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n...]\n```\nFind all div elements with a class that equals `quote` and that contain a `.text` element whose content includes the word 'world'.\n```python\n>>> page.find_all('div', {'class': 'quote'}, lambda e: \"world\" in e.css('.text::text').get())\n[<data='<div class=\"quote\" itemscope itemtype=\"h...' 
parent='<div class=\"col-md-8\"> <div class=\"quote...'>]\n```\nFind all elements that have children.\n```python\n>>> page.find_all(lambda element: len(element.children) > 0)\n[<data='<html lang=\"en\"><head><meta charset=\"UTF...'>,\n <data='<head><meta charset=\"UTF-8\"><title>Quote...' parent='<html lang=\"en\"><head><meta charset=\"UTF...'>,\n <data='<body> <div class=\"container\"> <div clas...' parent='<html lang=\"en\"><head><meta charset=\"UTF...'>,\n...]\n```\nFind all elements that contain the word 'world' in their content.\n```python\n>>> page.find_all(lambda element: \"world\" in element.text)\n[<data='<span class=\"text\" itemprop=\"text\">“The...' parent='<div class=\"quote\" itemscope itemtype=\"h...'>,\n <data='<a class=\"tag\" href=\"/tag/world/page/1/\"...' parent='<div class=\"tags\"> Tags: <meta class=\"ke...'>]\n```\nFind all span elements that match the given regex\n```python\n>>> page.find_all('span', re.compile(r'world'))\n[<data='<span class=\"text\" itemprop=\"text\">“The...' parent='<div class=\"quote\" itemscope itemtype=\"h...'>]\n```\nFind all div and span elements with class 'quote' (No span elements like that, so only div returned)\n```python\n>>> page.find_all(['div', 'span'], {'class': 'quote'})\n[<data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n <data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n...]\n```\nMix things up\n```python\n>>> page.find_all({'itemtype':\"http://schema.org/CreativeWork\"}, 'div').css('.author::text').getall()\n['Albert Einstein',\n 'J.K. Rowling',\n...]\n```\nA bonus pro tip: Find all elements whose `href` attribute's value ends with the word 'Einstein'.\n```python\n>>> page.find_all({'href$': 'Einstein'})\n[<data='<a href=\"/author/Albert-Einstein\">(about...' parent='<span>by <small class=\"author\" itemprop=...'>,\n <data='<a href=\"/author/Albert-Einstein\">(about...' parent='<span>by <small class=\"author\" itemprop=...'>,\n <data='<a href=\"/author/Albert-Einstein\">(about...' parent='<span>by <small class=\"author\" itemprop=...'>]\n```\nAnother pro tip: Find all elements whose `href` attribute's value has '/author/' in it\n```python\n>>> page.find_all({'href*': '/author/'})\n[<data='<a href=\"/author/Albert-Einstein\">(about...' parent='<span>by <small class=\"author\" itemprop=...'>,\n <data='<a href=\"/author/J-K-Rowling\">(about)</a...' parent='<span>by <small class=\"author\" itemprop=...'>,\n <data='<a href=\"/author/Albert-Einstein\">(about...' 
parent='<span>by <small class=\"author\" itemprop=...'>,\n...]\n```\nAnd so on...\n\n## Generating selectors\nCSS/XPath selectors can be generated for any element, regardless of the method used to find it.\n\nGenerate a short CSS selector for the `url_element` element (if possible, create a short one; otherwise, it's a full selector)\n```python\n>>> url_element = page.find({'href*': '/author/'})\n>>> url_element.generate_css_selector\n'body > div > div:nth-of-type(2) > div > div > span:nth-of-type(2) > a'\n```\nGenerate a full CSS selector for the `url_element` element from the start of the page\n```python\n>>> url_element.generate_full_css_selector\n'body > div > div:nth-of-type(2) > div > div > span:nth-of-type(2) > a'\n```\nGenerate a short XPath selector for the `url_element` element (if possible, create a short one; otherwise, it's a full selector)\n```python\n>>> url_element.generate_xpath_selector\n'//body/div/div[2]/div/div/span[2]/a'\n```\nGenerate a full XPath selector for the `url_element` element from the start of the page\n```python\n>>> url_element.generate_full_xpath_selector\n'//body/div/div[2]/div/div/span[2]/a'\n```\n**Note:** When generating a short selector, Scrapling tries to find a unique element (e.g., one with an `id` attribute) as a stop point. If none exists, the short and full selectors will be identical.\n\n## Using selectors with regular expressions\nSimilar to `parsel`/`scrapy`, `re` and `re_first` methods are available for extracting data using regular expressions. These methods exist in `Selector`, `Selectors`, `TextHandler`, and `TextHandlers`, so they can be used directly on elements even without selecting a text node. See the [TextHandler](main_classes.md#texthandler) class for details.\n\nExamples:\n```python\n>>> page.css('.price_color')[0].re_first(r'[\\d\\.]+')\n'51.77'\n\n>>> page.css('.price_color').re_first(r'[\\d\\.]+')\n'51.77'\n\n>>> page.css('.price_color').re(r'[\\d\\.]+')\n['51.77',\n '53.74',\n '50.10',\n '47.82',\n '54.23',\n...]\n\n>>> page.css('.product_pod h3 a::attr(href)').re(r'catalogue/(.*)/index.html')\n['a-light-in-the-attic_1000',\n 'tipping-the-velvet_999',\n 'soumission_998',\n 'sharp-objects_997',\n...]\n\n>>> filtering_function = lambda e: e.parent.tag == 'h3' and e.parent.parent.has_class('product_pod')  # As above selector\n>>> page.find('a', filtering_function).attrib['href'].re(r'catalogue/(.*)/index.html')\n['a-light-in-the-attic_1000']\n\n>>> page.find_by_text('Tipping the Velvet').attrib['href'].re(r'catalogue/(.*)/index.html')\n['tipping-the-velvet_999']\n```\nSee the [TextHandler](main_classes.md#texthandler) class for more details on regex methods."
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/spiders/advanced.md",
    "content": "# Advanced usages\n\n## Concurrency Control\n\nThe spider system uses three class attributes to control how aggressively it crawls:\n\n| Attribute                        | Default | Description                                                      |\n|----------------------------------|---------|------------------------------------------------------------------|\n| `concurrent_requests`            | `4`     | Maximum number of requests being processed at the same time      |\n| `concurrent_requests_per_domain` | `0`     | Maximum concurrent requests per domain (0 = no per-domain limit) |\n| `download_delay`                 | `0.0`   | Seconds to wait before each request                              |\n\n```python\nclass PoliteSpider(Spider):\n    name = \"polite\"\n    start_urls = [\"https://example.com\"]\n\n    # Be gentle with the server\n    concurrent_requests = 4\n    concurrent_requests_per_domain = 2\n    download_delay = 1.0  # Wait 1 second between requests\n\n    async def parse(self, response: Response):\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nWhen `concurrent_requests_per_domain` is set, each domain gets its own concurrency limiter in addition to the global limit. This is useful when crawling multiple domains simultaneously — you can allow high global concurrency while being polite to each individual domain.\n\n**Tip:** The `download_delay` parameter adds a fixed wait before every request, regardless of the domain. Use it for simple rate limiting.\n\n### Using uvloop\n\nThe `start()` method accepts a `use_uvloop` parameter to use the faster [uvloop](https://github.com/MagicStack/uvloop)/[winloop](https://github.com/nicktimko/winloop) event loop implementation, if available:\n\n```python\nresult = MySpider().start(use_uvloop=True)\n```\n\nThis can improve throughput for I/O-heavy crawls. You'll need to install `uvloop` (Linux/macOS) or `winloop` (Windows) separately.\n\n## Pause & Resume\n\nThe spider supports graceful pause-and-resume via checkpointing. To enable it, pass a `crawldir` directory to the spider constructor:\n\n```python\nspider = MySpider(crawldir=\"crawl_data/my_spider\")\nresult = spider.start()\n\nif result.paused:\n    print(\"Crawl was paused. Run again to resume.\")\nelse:\n    print(\"Crawl completed!\")\n```\n\n### How It Works\n\n1. **Pausing**: Press `Ctrl+C` during a crawl. The spider waits for all in-flight requests to finish, saves a checkpoint (pending requests + a set of seen request fingerprints), and then exits.\n2. **Force stopping**: Press `Ctrl+C` a second time to stop immediately without waiting for active tasks.\n3. **Resuming**: Run the spider again with the same `crawldir`. It detects the checkpoint, restores the queue and seen set, and continues from where it left off — skipping `start_requests()`.\n4. **Cleanup**: When a crawl completes normally (not paused), the checkpoint files are deleted automatically.\n\n**Checkpoints are also saved periodically during the crawl (every 5 minutes by default).** \n\nYou can change the interval as follows:\n\n```python\n# Save checkpoint every 2 minutes\nspider = MySpider(crawldir=\"crawl_data/my_spider\", interval=120.0)\n```\n\nThe writing to the disk is atomic, so it's totally safe.\n\n**Tip:** Pressing `Ctrl+C` during a crawl always causes the spider to close gracefully, even if the checkpoint system is not enabled. 
Doing it again without waiting forces the spider to close immediately.\n\n### Knowing If You're Resuming\n\nThe `on_start()` hook receives a `resuming` flag:\n\n```python\nasync def on_start(self, resuming: bool = False):\n    if resuming:\n        self.logger.info(\"Resuming from checkpoint!\")\n    else:\n        self.logger.info(\"Starting fresh crawl\")\n```\n\n## Streaming\n\nFor long-running spiders or applications that need real-time access to scraped items, use the `stream()` method instead of `start()`:\n\n```python\nimport anyio\n\nasync def main():\n    spider = MySpider()\n    async for item in spider.stream():\n        print(f\"Got item: {item}\")\n        # Access real-time stats\n        print(f\"Items so far: {spider.stats.items_scraped}\")\n        print(f\"Requests made: {spider.stats.requests_count}\")\n\nanyio.run(main)\n```\n\nKey differences from `start()`:\n\n- `stream()` must be called from an async context\n- Items are yielded one by one as they're scraped, not collected into a list\n- You can access `spider.stats` during iteration for real-time statistics\n\n**Note:** The full list of all stats that can be accessed by `spider.stats` is explained below [here](#results--statistics).\n\nYou can use it with the checkpoint system too, so it's easy to build UI on top of spiders. UIs that have real-time data and can be paused/resumed.\n\n```python\nimport anyio\n\nasync def main():\n    spider = MySpider(crawldir=\"crawl_data/my_spider\")\n    async for item in spider.stream():\n        print(f\"Got item: {item}\")\n        # Access real-time stats\n        print(f\"Items so far: {spider.stats.items_scraped}\")\n        print(f\"Requests made: {spider.stats.requests_count}\")\n\nanyio.run(main)\n```\nYou can also use `spider.pause()` to shut down the spider in the code above. If you used it without enabling the checkpoint system, it will just close the crawl.\n\n## Lifecycle Hooks\n\nThe spider provides several hooks you can override to add custom behavior at different stages of the crawl:\n\n### on_start\n\nCalled before crawling begins. Use it for setup tasks like loading data or initializing resources:\n\n```python\nasync def on_start(self, resuming: bool = False):\n    self.logger.info(\"Spider starting up\")\n    # Load seed URLs from a database, initialize counters, etc.\n```\n\n### on_close\n\nCalled after crawling finishes (whether completed or paused). Use it for cleanup:\n\n```python\nasync def on_close(self):\n    self.logger.info(\"Spider shutting down\")\n    # Close database connections, flush buffers, etc.\n```\n\n### on_error\n\nCalled when a request fails with an exception. Use it for error tracking or custom recovery logic:\n\n```python\nasync def on_error(self, request: Request, error: Exception):\n    self.logger.error(f\"Failed: {request.url} - {error}\")\n    # Log to error tracker, save failed URL for later, etc.\n```\n\n### on_scraped_item\n\nCalled for every scraped item before it's added to the results. 
Return the item (modified or not) to keep it, or return `None` to drop it:\n\n```python\nasync def on_scraped_item(self, item: dict) -> dict | None:\n    # Drop items without a title\n    if not item.get(\"title\"):\n        return None\n\n    # Modify items (e.g., add timestamps)\n    item[\"scraped_at\"] = \"2026-01-01\"\n    return item\n```\n\n**Tip:** This hook can also be used to direct items through your own pipelines and drop them from the spider.\n\n### start_requests\n\nOverride `start_requests()` for custom initial request generation instead of using `start_urls`:\n\n```python\nasync def start_requests(self):\n    # POST request to log in first\n    yield Request(\n        \"https://example.com/login\",\n        method=\"POST\",\n        data={\"user\": \"admin\", \"pass\": \"secret\"},\n        callback=self.after_login,\n    )\n\nasync def after_login(self, response: Response):\n    # Now crawl the authenticated pages\n    yield response.follow(\"/dashboard\", callback=self.parse)\n```\n\n## Results & Statistics\n\nThe `CrawlResult` returned by `start()` contains both the scraped items and detailed statistics:\n\n```python\nresult = MySpider().start()\n\n# Items\nprint(f\"Total items: {len(result.items)}\")\nresult.items.to_json(\"output.json\", indent=True)\n\n# Did the crawl complete?\nprint(f\"Completed: {result.completed}\")\nprint(f\"Paused: {result.paused}\")\n\n# Statistics\nstats = result.stats\nprint(f\"Requests: {stats.requests_count}\")\nprint(f\"Failed: {stats.failed_requests_count}\")\nprint(f\"Blocked: {stats.blocked_requests_count}\")\nprint(f\"Offsite filtered: {stats.offsite_requests_count}\")\nprint(f\"Items scraped: {stats.items_scraped}\")\nprint(f\"Items dropped: {stats.items_dropped}\")\nprint(f\"Response bytes: {stats.response_bytes}\")\nprint(f\"Duration: {stats.elapsed_seconds:.1f}s\")\nprint(f\"Speed: {stats.requests_per_second:.1f} req/s\")\n```\n\n### Detailed Stats\n\nThe `CrawlStats` object tracks granular information:\n\n```python\nstats = result.stats\n\n# Status code distribution\nprint(stats.response_status_count)\n# {'status_200': 150, 'status_404': 3, 'status_403': 1}\n\n# Bytes downloaded per domain\nprint(stats.domains_response_bytes)\n# {'example.com': 1234567, 'api.example.com': 45678}\n\n# Requests per session\nprint(stats.sessions_requests_count)\n# {'http': 120, 'stealth': 34}\n\n# Proxies used during the crawl\nprint(stats.proxies)\n# ['http://proxy1:8080', 'http://proxy2:8080']\n\n# Log level counts\nprint(stats.log_levels_counter)\n# {'debug': 200, 'info': 50, 'warning': 3, 'error': 1, 'critical': 0}\n\n# Timing information\nprint(stats.start_time)       # Unix timestamp when crawl started\nprint(stats.end_time)         # Unix timestamp when crawl finished\nprint(stats.download_delay)   # The download delay used (seconds)\n\n# Concurrency settings used\nprint(stats.concurrent_requests)             # Global concurrency limit\nprint(stats.concurrent_requests_per_domain)  # Per-domain concurrency limit\n\n# Custom stats (set by your spider code)\nprint(stats.custom_stats)\n# {'login_attempts': 3, 'pages_with_errors': 5}\n\n# Export everything as a dict\nprint(stats.to_dict())\n```\n\n## Logging\n\nThe spider has a built-in logger accessible via `self.logger`. 
It's pre-configured with the spider's name and supports several customization options:\n\n| Attribute             | Default                                                      | Description                                        |\n|-----------------------|--------------------------------------------------------------|----------------------------------------------------|\n| `logging_level`       | `logging.DEBUG`                                              | Minimum log level                                  |\n| `logging_format`      | `\"[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s\"` | Log message format                                 |\n| `logging_date_format` | `\"%Y-%m-%d %H:%M:%S\"`                                        | Date format in log messages                        |\n| `log_file`            | `None`                                                       | Path to a log file (in addition to console output) |\n\n```python\nimport logging\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n    logging_level = logging.INFO\n    log_file = \"logs/my_spider.log\"\n\n    async def parse(self, response: Response):\n        self.logger.info(f\"Processing {response.url}\")\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nThe log file directory is created automatically if it doesn't exist. Both console and file output use the same format."
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/spiders/architecture.md",
    "content": "# Spiders architecture\n\nScrapling's spider system is an async crawling framework designed for concurrent, multi-session crawls with built-in pause/resume support. It brings together Scrapling's parsing engine and fetchers into a unified crawling API while adding scheduling, concurrency control, and checkpointing.\n\n## Data Flow\n\nThe diagram below shows how data flows through the spider system when a crawl is running:\n\nHere's what happens step by step when you run a spider:\n\n1. The **Spider** produces the first batch of `Request` objects. By default, it creates one request for each URL in `start_urls`, but you can override `start_requests()` for custom logic.\n2. The **Scheduler** receives requests and places them in a priority queue, and creates fingerprints for them. Higher-priority requests are dequeued first.\n3. The **Crawler Engine** asks the **Scheduler** to dequeue the next request, respecting concurrency limits (global and per-domain) and download delays. Once the **Crawler Engine** receives the request, it passes it to the **Session Manager**, which routes it to the correct session based on the request's `sid` (session ID).\n4. The **session** fetches the page and returns a [Response](../fetching/choosing.md#response-object) object to the **Crawler Engine**. The engine records statistics and checks for blocked responses. If the response is blocked, the engine retries the request up to `max_blocked_retries` times. Of course, the blocking detection and the retry logic for blocked requests can be customized.\n5. The **Crawler Engine** passes the [Response](../fetching/choosing.md#response-object) to the request's callback. The callback either yields a dictionary, which gets treated as a scraped item, or a follow-up request, which gets sent to the scheduler for queuing.\n6. The cycle repeats from step 2 until the scheduler is empty and no tasks are active, or the spider is paused.\n7. If `crawldir` is set while starting the spider, the **Crawler Engine** periodically saves a checkpoint (pending requests + seen URLs set) to disk. On graceful shutdown (Ctrl+C), a final checkpoint is saved. The next time the spider runs with the same `crawldir`, it resumes from where it left off — skipping `start_requests()` and restoring the scheduler state.\n\n\n## Components\n\n### Spider\n\nThe central class you interact with. You subclass `Spider`, define your `start_urls` and `parse()` method, and optionally configure sessions and override lifecycle hooks.\n\n```python\nfrom scrapling.spiders import Spider, Response, Request\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n\n    async def parse(self, response: Response):\n        for link in response.css(\"a::attr(href)\").getall():\n            yield response.follow(link, callback=self.parse_page)\n\n    async def parse_page(self, response: Response):\n        yield {\"title\": response.css(\"h1::text\").get(\"\")}\n```\n\n### Crawler Engine\n\nThe engine orchestrates the entire crawl. It manages the main loop, enforces concurrency limits, dispatches requests through the Session Manager, and processes results from callbacks. You don't interact with it directly — the `Spider.start()` and `Spider.stream()` methods handle it for you.\n\n### Scheduler\n\nA priority queue with built-in URL deduplication. Requests are fingerprinted based on their URL, HTTP method, body, and session ID. 
The scheduler supports `snapshot()` and `restore()` for the checkpoint system, allowing the crawl state to be saved and resumed.\n\n### Session Manager\n\nManages one or more named session instances. Each session is one of:\n\n- [FetcherSession](../fetching/static.md)\n- [AsyncDynamicSession](../fetching/dynamic.md)\n- [AsyncStealthySession](../fetching/stealthy.md)\n\nWhen a request comes in, the Session Manager routes it to the correct session based on the request's `sid` field. Sessions can be started with the spider start (default) or lazily (started on the first use).\n\n### Checkpoint System\n\nAn optional system that, if enabled, saves the crawler's state (pending requests + seen URL fingerprints) to a pickle file on disk. Writes are atomic (temp file + rename) to prevent corruption. Checkpoints are saved periodically at a configurable interval and on graceful shutdown. Upon successful completion (not paused), checkpoint files are automatically cleaned up.\n\n### Output\n\nScraped items are collected in an `ItemList` (a list subclass with `to_json()` and `to_jsonl()` export methods). Crawl statistics are tracked in a `CrawlStats` dataclass which contains a lot of useful info.\n\n\n## Comparison with Scrapy\n\nIf you're coming from Scrapy, here's how Scrapling's spider system maps:\n\n| Concept            | Scrapy                        | Scrapling                                                       |\n|--------------------|-------------------------------|-----------------------------------------------------------------|\n| Spider definition  | `scrapy.Spider` subclass      | `scrapling.spiders.Spider` subclass                             |\n| Initial requests   | `start_requests()`            | `async start_requests()`                                        |\n| Callbacks          | `def parse(self, response)`   | `async def parse(self, response)`                               |\n| Following links    | `response.follow(url)`        | `response.follow(url)`                                          |\n| Item output        | `yield dict` or `yield Item`  | `yield dict`                                                    |\n| Request scheduling | Scheduler + Dupefilter        | Scheduler with built-in deduplication                           |\n| Downloading        | Downloader + Middlewares      | Session Manager with multi-session support                      |\n| Item processing    | Item Pipelines                | `on_scraped_item()` hook                                        |\n| Blocked detection  | Through custom middlewares    | Built-in `is_blocked()` + `retry_blocked_request()` hooks       |\n| Concurrency        | `CONCURRENT_REQUESTS` setting | `concurrent_requests` class attribute                           |\n| Domain filtering   | `allowed_domains`             | `allowed_domains`                                               |\n| Pause/Resume       | `JOBDIR` setting              | `crawldir` constructor argument                                 |\n| Export             | Feed exports                  | `result.items.to_json()` / `to_jsonl()` or custom through hooks |\n| Running            | `scrapy crawl spider_name`    | `MySpider().start()`                                            |\n| Streaming          | N/A                           | `async for item in spider.stream()`                             |\n| Multi-session      | N/A                           | Multiple sessions with different types per spider               |"
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/spiders/getting-started.md",
    "content": "# Getting started\n\n## Your First Spider\n\nA spider is a class that defines how to crawl and extract data from websites. Here's the simplest possible spider:\n\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com\"]\n\n    async def parse(self, response: Response):\n        for quote in response.css(\"div.quote\"):\n            yield {\n                \"text\": quote.css(\"span.text::text\").get(\"\"),\n                \"author\": quote.css(\"small.author::text\").get(\"\"),\n            }\n```\n\nEvery spider needs three things:\n\n1. **`name`** — A unique identifier for the spider.\n2. **`start_urls`** — A list of URLs to start crawling from.\n3. **`parse()`** — An async generator method that processes each response and yields results.\n\nThe `parse()` method processes each response. You use the same selection methods you'd use with Scrapling's [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object), and `yield` dictionaries to output scraped items.\n\n## Running the Spider\n\nTo run your spider, create an instance and call `start()`:\n\n```python\nresult = QuotesSpider().start()\n```\n\nThe `start()` method handles all the async machinery internally — no need to worry about event loops. While the spider is running, everything that happens is logged to the terminal, and at the end of the crawl, you get very detailed stats.\n\nThose stats are in the returned `CrawlResult` object, which gives you everything you need:\n\n```python\nresult = QuotesSpider().start()\n\n# Access scraped items\nfor item in result.items:\n    print(item[\"text\"], \"-\", item[\"author\"])\n\n# Check statistics\nprint(f\"Scraped {result.stats.items_scraped} items\")\nprint(f\"Made {result.stats.requests_count} requests\")\nprint(f\"Took {result.stats.elapsed_seconds:.1f} seconds\")\n\n# Did the crawl finish or was it paused?\nprint(f\"Completed: {result.completed}\")\n```\n\n## Following Links\n\nMost crawls need to follow links across multiple pages. Use `response.follow()` to create follow-up requests:\n\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com\"]\n\n    async def parse(self, response: Response):\n        # Extract items from the current page\n        for quote in response.css(\"div.quote\"):\n            yield {\n                \"text\": quote.css(\"span.text::text\").get(\"\"),\n                \"author\": quote.css(\"small.author::text\").get(\"\"),\n            }\n\n        # Follow the \"next page\" link\n        next_page = response.css(\"li.next a::attr(href)\").get()\n        if next_page:\n            yield response.follow(next_page, callback=self.parse)\n```\n\n`response.follow()` handles relative URLs automatically — it joins them with the current page's URL. 
It also sets the current page as the `Referer` header by default.\n\nYou can point follow-up requests at different callback methods for different page types:\n\n```python\nasync def parse(self, response: Response):\n    for link in response.css(\"a.product-link::attr(href)\").getall():\n        yield response.follow(link, callback=self.parse_product)\n\nasync def parse_product(self, response: Response):\n    yield {\n        \"name\": response.css(\"h1::text\").get(\"\"),\n        \"price\": response.css(\".price::text\").get(\"\"),\n    }\n```\n\n**Note:** All callback methods must be async generators (using `async def` and `yield`).\n\n## Exporting Data\n\nThe `ItemList` returned in `result.items` has built-in export methods:\n\n```python\nresult = QuotesSpider().start()\n\n# Export as JSON\nresult.items.to_json(\"quotes.json\")\n\n# Export as JSON with pretty-printing\nresult.items.to_json(\"quotes.json\", indent=True)\n\n# Export as JSON Lines (one JSON object per line)\nresult.items.to_jsonl(\"quotes.jsonl\")\n```\n\nBoth methods create parent directories automatically if they don't exist.\n\n## Filtering Domains\n\nUse `allowed_domains` to restrict the spider to specific domains. This prevents it from accidentally following links to external websites:\n\n```python\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n    allowed_domains = {\"example.com\"}\n\n    async def parse(self, response: Response):\n        for link in response.css(\"a::attr(href)\").getall():\n            # Links to other domains are silently dropped\n            yield response.follow(link, callback=self.parse)\n```\n\nSubdomains are matched automatically — setting `allowed_domains = {\"example.com\"}` also allows `sub.example.com`, `blog.example.com`, etc.\n\nWhen a request is filtered out, it's counted in `stats.offsite_requests_count` so you can see how many were dropped.\n\n"
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/spiders/proxy-blocking.md",
    "content": "# Proxy management and handling Blocks\n\nScrapling's `ProxyRotator` manages proxy rotation across requests. It works with all session types and integrates with the spider's blocked request retry system.\n\n## ProxyRotator\n\nThe `ProxyRotator` class manages a list of proxies and rotates through them automatically. Pass it to any session type via the `proxy_rotator` parameter:\n\n```python\nfrom scrapling.spiders import Spider, Response\nfrom scrapling.fetchers import FetcherSession, ProxyRotator\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n\n    def configure_sessions(self, manager):\n        rotator = ProxyRotator([\n            \"http://proxy1:8080\",\n            \"http://proxy2:8080\",\n            \"http://user:pass@proxy3:8080\",\n        ])\n        manager.add(\"default\", FetcherSession(proxy_rotator=rotator))\n\n    async def parse(self, response: Response):\n        # Check which proxy was used\n        print(f\"Proxy used: {response.meta.get('proxy')}\")\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nEach request automatically gets the next proxy in the rotation. The proxy used is stored in `response.meta[\"proxy\"]` so you can track which proxy fetched which page.\n\n\nBrowser sessions support both string and dict proxy formats:\n\n```python\nfrom scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, ProxyRotator\n\n# String proxies work for all session types\nrotator = ProxyRotator([\n    \"http://proxy1:8080\",\n    \"http://proxy2:8080\",\n])\n\n# Dict proxies (Playwright format) work for browser sessions\nrotator = ProxyRotator([\n    {\"server\": \"http://proxy1:8080\", \"username\": \"user\", \"password\": \"pass\"},\n    {\"server\": \"http://proxy2:8080\"},\n])\n\n# Then inside the spider\ndef configure_sessions(self, manager):\n    rotator = ProxyRotator([\"http://proxy1:8080\", \"http://proxy2:8080\"])\n    manager.add(\"browser\", AsyncStealthySession(proxy_rotator=rotator))\n```\n\n**Important:**\n\n1. You cannot use the `proxy_rotator` argument together with the static `proxy` or `proxies` parameters on the same session. Pick one approach when configuring the session, and override it per request later if needed.\n2. By default, all browser-based sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. 
Once the tab's job is done, both the tab and its context are closed.\n\n## Custom Rotation Strategies\n\nBy default, `ProxyRotator` uses cyclic rotation — it iterates through proxies sequentially, wrapping around at the end.\n\nYou can provide a custom strategy function to change this behavior, but it has to match the below signature:\n\n```python\nfrom scrapling.core._types import ProxyType\n\ndef my_strategy(proxies: list, current_index: int) -> tuple[ProxyType, int]:\n    ...\n```\n\nIt receives the list of proxies and the current index, and must return the chosen proxy and the next index.\n\nBelow are some examples of custom rotation strategies you can use.\n\n### Random Rotation\n\n```python\nimport random\nfrom scrapling.fetchers import ProxyRotator\n\ndef random_strategy(proxies, current_index):\n    idx = random.randint(0, len(proxies) - 1)\n    return proxies[idx], idx\n\nrotator = ProxyRotator(\n    [\"http://proxy1:8080\", \"http://proxy2:8080\", \"http://proxy3:8080\"],\n    strategy=random_strategy,\n)\n```\n\n### Weighted Rotation\n\n```python\nimport random\n\ndef weighted_strategy(proxies, current_index):\n    # First proxy gets 60% of traffic, others split the rest\n    weights = [60] + [40 // (len(proxies) - 1)] * (len(proxies) - 1)\n    proxy = random.choices(proxies, weights=weights, k=1)[0]\n    return proxy, current_index  # Index doesn't matter for weighted\n\nrotator = ProxyRotator(proxies, strategy=weighted_strategy)\n```\n\n\n## Per-Request Proxy Override\n\nYou can override the rotator for individual requests by passing `proxy=` as a keyword argument:\n\n```python\nasync def parse(self, response: Response):\n    # This request uses the rotator's next proxy\n    yield response.follow(\"/page1\", callback=self.parse_page)\n\n    # This request uses a specific proxy, bypassing the rotator\n    yield response.follow(\n        \"/special-page\",\n        callback=self.parse_page,\n        proxy=\"http://special-proxy:8080\",\n    )\n```\n\nThis is useful when certain pages require a specific proxy (e.g., a geo-located proxy for region-specific content).\n\n## Blocked Request Handling\n\nThe spider has built-in blocked request detection and retry. By default, it considers the following HTTP status codes blocked: `401`, `403`, `407`, `429`, `444`, `500`, `502`, `503`, `504`.\n\nThe retry system works like this:\n\n1. After a response comes back, the spider calls the `is_blocked(response)` method.\n2. If blocked, it copies the request and calls the `retry_blocked_request()` method so you can modify it before retrying.\n3. The retried request is re-queued with `dont_filter=True` (bypassing deduplication) and lower priority, so it's not retried right away.\n4. This repeats up to `max_blocked_retries` times (default: 3).\n\n**Tip:**\n\n1. On retry, the previous `proxy`/`proxies` kwargs are cleared from the request automatically, so the rotator assigns a fresh proxy.\n2. 
The `max_blocked_retries` attribute is different than the session retries and doesn't share the counter.\n\n### Custom Block Detection\n\nOverride `is_blocked()` to add your own detection logic:\n\n```python\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n\n    async def is_blocked(self, response: Response) -> bool:\n        # Check status codes (default behavior)\n        if response.status in {403, 429, 503}:\n            return True\n\n        # Check response content\n        body = response.body.decode(\"utf-8\", errors=\"ignore\")\n        if \"access denied\" in body.lower() or \"rate limit\" in body.lower():\n            return True\n\n        return False\n\n    async def parse(self, response: Response):\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\n### Customizing Retries\n\nOverride `retry_blocked_request()` to modify the request before retrying. The `max_blocked_retries` attribute controls how many times a blocked request is retried (default: 3):\n\n```python\nfrom scrapling.spiders import Spider, SessionManager, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n    max_blocked_retries = 5\n\n    def configure_sessions(self, manager: SessionManager) -> None:\n        manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari']))\n        manager.add('stealth', AsyncStealthySession(block_webrtc=True), lazy=True)\n\n    async def retry_blocked_request(self, request: Request, response: Response) -> Request:\n        request.sid = \"stealth\"\n        self.logger.info(f\"Retrying blocked request: {request.url}\")\n        return request\n\n    async def parse(self, response: Response):\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nWhat happened above is that I left the blocking detection logic unchanged and had the spider mainly use requests until it got blocked, then switch to the stealthy browser.\n\n\nPutting it all together:\n\n```python\nfrom scrapling.spiders import Spider, SessionManager, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, ProxyRotator\n\n\ncheap_proxies = ProxyRotator([ \"http://proxy1:8080\", \"http://proxy2:8080\"])\n\n# A format acceptable by the browser\nexpensive_proxies = ProxyRotator([\n    {\"server\": \"http://residential_proxy1:8080\", \"username\": \"user\", \"password\": \"pass\"},\n    {\"server\": \"http://residential_proxy2:8080\", \"username\": \"user\", \"password\": \"pass\"},\n    {\"server\": \"http://mobile_proxy1:8080\", \"username\": \"user\", \"password\": \"pass\"},\n    {\"server\": \"http://mobile_proxy2:8080\", \"username\": \"user\", \"password\": \"pass\"},\n])\n\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n    max_blocked_retries = 5\n\n    def configure_sessions(self, manager: SessionManager) -> None:\n        manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari'], proxy_rotator=cheap_proxies))\n        manager.add('stealth', AsyncStealthySession(block_webrtc=True, proxy_rotator=expensive_proxies), lazy=True)\n\n    async def retry_blocked_request(self, request: Request, response: Response) -> Request:\n        request.sid = \"stealth\"\n        self.logger.info(f\"Retrying blocked request: {request.url}\")\n        return request\n\n    async def 
parse(self, response: Response):\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\nWith this setup, requests go out through cheap proxies (e.g., datacenter proxies) until they get blocked, and are then retried through the higher-quality residential or mobile proxies."
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/spiders/requests-responses.md",
    "content": "# Requests & Responses\n\nThis page covers the `Request` object in detail — how to construct requests, pass data between callbacks, control priority and deduplication, and use `response.follow()` for link-following.\n\n## The Request Object\n\nA `Request` represents a URL to be fetched. You create requests either directly or via `response.follow()`:\n\n```python\nfrom scrapling.spiders import Request\n\n# Direct construction\nrequest = Request(\n    \"https://example.com/page\",\n    callback=self.parse_page,\n    priority=5,\n)\n\n# Via response.follow (preferred in callbacks)\nrequest = response.follow(\"/page\", callback=self.parse_page)\n```\n\nHere are all the arguments you can pass to `Request`:\n\n| Argument      | Type       | Default    | Description                                                                                           |\n|---------------|------------|------------|-------------------------------------------------------------------------------------------------------|\n| `url`         | `str`      | *required* | The URL to fetch                                                                                      |\n| `sid`         | `str`      | `\"\"`       | Session ID — routes the request to a specific session (see [Sessions](sessions.md))                   |\n| `callback`    | `callable` | `None`     | Async generator method to process the response. Defaults to `parse()`                                 |\n| `priority`    | `int`      | `0`        | Higher values are processed first                                                                     |\n| `dont_filter` | `bool`     | `False`    | If `True`, skip deduplication (allow duplicate requests)                                              |\n| `meta`        | `dict`     | `{}`       | Arbitrary metadata passed through to the response                                                     |\n| `**kwargs`    |            |            | Additional keyword arguments passed to the session's fetch method (e.g., `headers`, `method`, `data`) |\n\nAny extra keyword arguments are forwarded directly to the underlying session. For example, to make a POST request:\n\n```python\nyield Request(\n    \"https://example.com/api\",\n    method=\"POST\",\n    data={\"key\": \"value\"},\n    callback=self.parse_result,\n)\n```\n\n## Response.follow()\n\n`response.follow()` is the recommended way to create follow-up requests inside callbacks. 
It offers several advantages over constructing `Request` objects directly:\n\n- **Relative URLs** are resolved automatically against the current page URL\n- **Referer header** is set to the current page URL by default\n- **Session kwargs** from the original request are inherited (headers, proxy settings, etc.)\n- **Callback, session ID, and priority** are inherited from the original request if not specified\n\n```python\nasync def parse(self, response: Response):\n    # Minimal — inherits callback, sid, priority from current request\n    yield response.follow(\"/next-page\")\n\n    # Override specific fields\n    yield response.follow(\n        \"/product/123\",\n        callback=self.parse_product,\n        priority=10,\n    )\n\n    # Pass additional metadata to\n    yield response.follow(\n        \"/details\",\n        callback=self.parse_details,\n        meta={\"category\": \"electronics\"},\n    )\n```\n\n| Argument           | Type       | Default    | Description                                                |\n|--------------------|------------|------------|------------------------------------------------------------|\n| `url`              | `str`      | *required* | URL to follow (absolute or relative)                       |\n| `sid`              | `str`      | `\"\"`       | Session ID (inherits from original request if empty)       |\n| `callback`         | `callable` | `None`     | Callback method (inherits from original request if `None`) |\n| `priority`         | `int`      | `None`     | Priority (inherits from original request if `None`)        |\n| `dont_filter`      | `bool`     | `False`    | Skip deduplication                                         |\n| `meta`             | `dict`     | `None`     | Metadata (merged with existing response meta)              |\n| **`referer_flow`** | `bool`     | `True`     | Set current URL as Referer header                          |\n| `**kwargs`         |            |            | Merged with original request's session kwargs              |\n\n### Disabling Referer Flow\n\nBy default, `response.follow()` sets the `Referer` header to the current page URL. To disable this:\n\n```python\nyield response.follow(\"/page\", referer_flow=False)\n```\n\n## Callbacks\n\nCallbacks are async generator methods on your spider that process responses. They must `yield` one of three types:\n\n- **`dict`** — A scraped item, added to the results\n- **`Request`** — A follow-up request, added to the queue\n- **`None`** — Silently ignored\n\n```python\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n\n    async def parse(self, response: Response):\n        # Yield items (dicts)\n        yield {\"url\": response.url, \"title\": response.css(\"title::text\").get(\"\")}\n\n        # Yield follow-up requests\n        for link in response.css(\"a::attr(href)\").getall():\n            yield response.follow(link, callback=self.parse_page)\n\n    async def parse_page(self, response: Response):\n        yield {\"content\": response.css(\"article::text\").get(\"\")}\n```\n\n**Note:** All callback methods must be `async def` and use `yield` (not `return`). Even if a callback only yields items with no follow-up requests, it must still be an async generator.\n\n## Request Priority\n\nRequests with higher priority values are processed first. 
This is useful when some pages should be processed before others:\n\n```python\nasync def parse(self, response: Response):\n    # High priority — process product pages first\n    for link in response.css(\"a.product::attr(href)\").getall():\n        yield response.follow(link, callback=self.parse_product, priority=10)\n\n    # Low priority — pagination links processed after products\n    next_page = response.css(\"a.next::attr(href)\").get()\n    if next_page:\n        yield response.follow(next_page, callback=self.parse, priority=0)\n```\n\nWhen using `response.follow()`, the priority is inherited from the original request unless you specify a new one.\n\n## Deduplication\n\nThe spider automatically deduplicates requests based on a fingerprint computed from the URL, HTTP method, request body, and session ID. If two requests produce the same fingerprint, the second one is silently dropped.\n\nTo allow duplicate requests (e.g., re-visiting a page after login), set `dont_filter=True`:\n\n```python\nyield Request(\"https://example.com/dashboard\", dont_filter=True, callback=self.parse_dashboard)\n\n# Or with response.follow\nyield response.follow(\"/dashboard\", dont_filter=True, callback=self.parse_dashboard)\n```\n\nYou can fine-tune what goes into the fingerprint using class attributes on your spider:\n\n| Attribute            | Default | Effect                                                                                                          |\n|----------------------|---------|-----------------------------------------------------------------------------------------------------------------|\n| `fp_include_kwargs`  | `False` | Include extra request kwargs (arguments you passed to the session fetch, like headers, etc.) in the fingerprint |\n| `fp_keep_fragments`  | `False` | Keep URL fragments (`#section`) when computing fingerprints                                                     |\n| `fp_include_headers` | `False` | Include request headers in the fingerprint                                                                      |\n\nFor example, if you need to treat `https://example.com/page#section1` and `https://example.com/page#section2` as different URLs:\n\n```python\nclass MySpider(Spider):\n    name = \"my_spider\"\n    fp_keep_fragments = True\n    # ...\n```\n\n## Request Meta\n\nThe `meta` dictionary lets you pass arbitrary data between callbacks. This is useful when you need context from one page to process another:\n\n```python\nasync def parse(self, response: Response):\n    for product in response.css(\"div.product\"):\n        category = product.css(\"span.category::text\").get(\"\")\n        link = product.css(\"a::attr(href)\").get()\n        if link:\n            yield response.follow(\n                link,\n                callback=self.parse_product,\n                meta={\"category\": category},\n            )\n\nasync def parse_product(self, response: Response):\n    yield {\n        \"name\": response.css(\"h1::text\").get(\"\"),\n        \"price\": response.css(\".price::text\").get(\"\"),\n        # Access meta from the request\n        \"category\": response.meta.get(\"category\", \"\"),\n    }\n```\n\nWhen using `response.follow()`, the meta from the current response is merged with the new meta you provide (new values take precedence).\n\nThe spider system also automatically stores some metadata. For example, the proxy used for a request is available as `response.meta[\"proxy\"]` when proxy rotation is enabled.
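\n\nIf you want to keep the proxy that served a page alongside your scraped data, here is a minimal sketch of a callback that reads it from the response meta (the `proxy` item field is purely illustrative):\n\n```python\nasync def parse(self, response: Response):\n    # Present only when a ProxyRotator is configured on the session\n    used_proxy = response.meta.get(\"proxy\", \"\")\n    yield {\"title\": response.css(\"title::text\").get(\"\"), \"proxy\": used_proxy}\n```"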
  },
  {
    "path": "agent-skill/Scrapling-Skill/references/spiders/sessions.md",
    "content": "# Spider Sessions\n\nA spider can use multiple fetcher sessions simultaneously — for example, a fast HTTP session for simple pages and a stealth browser session for protected pages.\n\n## What are Sessions?\n\nA session is a pre-configured fetcher instance that stays alive for the duration of the crawl. Instead of creating a new connection or browser for every request, the spider reuses sessions, which is faster and more resource-efficient.\n\nBy default, every spider creates a single [FetcherSession](../fetching/static.md). You can add more sessions or swap the default by overriding the `configure_sessions()` method, but note that only the async version of each session type can be used, as shown in the table below:\n\n\n| Session Type                                    | Use Case                                 |\n|-------------------------------------------------|------------------------------------------|\n| [FetcherSession](../fetching/static.md)         | Fast HTTP requests, no JavaScript        |\n| [AsyncDynamicSession](../fetching/dynamic.md)   | Browser automation, JavaScript rendering |\n| [AsyncStealthySession](../fetching/stealthy.md) | Anti-bot bypass, Cloudflare, etc.        |\n\n\n## Configuring Sessions\n\nOverride `configure_sessions()` on your spider to set up sessions. The `manager` parameter is a `SessionManager` instance — use `manager.add()` to register sessions:\n\n```python\nfrom scrapling.spiders import Spider, Response\nfrom scrapling.fetchers import FetcherSession\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"default\", FetcherSession())\n\n    async def parse(self, response: Response):\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nThe `manager.add()` method takes:\n\n| Argument     | Type      | Default    | Description                                  |\n|--------------|-----------|------------|----------------------------------------------|\n| `session_id` | `str`     | *required* | A name to reference this session in requests |\n| `session`    | `Session` | *required* | The session instance                         |\n| `default`    | `bool`    | `False`    | Make this the default session                |\n| `lazy`       | `bool`    | `False`    | Start the session only when first used       |\n\n**Notes:**\n\n1. If a request doesn't specify which session to use, the default session is used. The default is determined in one of two ways:\n    1. The first session added to the manager becomes the default automatically.\n    2. A session added with `default=True` becomes the default explicitly.\n2. The session instances you pass don't have to be started beforehand; the spider checks every session and starts any that aren't running yet.\n3. If you want a specific session to start only when it's first used, pass the `lazy` argument when adding it to the manager. 
Example: start the browser only when you need it, not with the spider start.\n\n## Multi-Session Spider\n\nHere's a practical example: use a fast HTTP session for listing pages and a stealth browser for detail pages that have bot protection:\n\n```python\nfrom scrapling.spiders import Spider, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass ProductSpider(Spider):\n    name = \"products\"\n    start_urls = [\"https://shop.example.com/products\"]\n\n    def configure_sessions(self, manager):\n        # Fast HTTP for listing pages (default)\n        manager.add(\"http\", FetcherSession())\n\n        # Stealth browser for protected product pages\n        manager.add(\"stealth\", AsyncStealthySession(\n            headless=True,\n            network_idle=True,\n        ))\n\n    async def parse(self, response: Response):\n        for link in response.css(\"a.product::attr(href)\").getall():\n            # Route product pages through the stealth session\n            yield response.follow(link, sid=\"stealth\", callback=self.parse_product)\n\n        next_page = response.css(\"a.next::attr(href)\").get()\n        if next_page:\n            yield response.follow(next_page)\n\n    async def parse_product(self, response: Response):\n        yield {\n            \"name\": response.css(\"h1::text\").get(\"\"),\n            \"price\": response.css(\".price::text\").get(\"\"),\n        }\n```\n\nThe key is the `sid` parameter — it tells the spider which session to use for each request. When you call `response.follow()` without `sid`, the session ID from the original request is inherited.\n\nSessions can also be different instances of the same class with different configurations:\n\n```python\nfrom scrapling.spiders import Spider, Response\nfrom scrapling.fetchers import FetcherSession\n\nclass ProductSpider(Spider):\n    name = \"products\"\n    start_urls = [\"https://shop.example.com/products\"]\n\n    def configure_sessions(self, manager):\n        chrome_requests = FetcherSession(impersonate=\"chrome\")\n        firefox_requests = FetcherSession(impersonate=\"firefox\")\n\n        manager.add(\"chrome\", chrome_requests)\n        manager.add(\"firefox\", firefox_requests)\n\n    async def parse(self, response: Response):\n        for link in response.css(\"a.product::attr(href)\").getall():\n            yield response.follow(link, callback=self.parse_product)\n\n        next_page = response.css(\"a.next::attr(href)\").get()\n        if next_page:\n            yield response.follow(next_page, sid=\"firefox\")\n\n    async def parse_product(self, response: Response):\n        yield {\n            \"name\": response.css(\"h1::text\").get(\"\"),\n            \"price\": response.css(\".price::text\").get(\"\"),\n        }\n```\n\n## Session Arguments\n\nExtra keyword arguments passed to a `Request` (or through `response.follow(**kwargs)`) are forwarded to the session's fetch method. 
This lets you customize individual requests without changing the session configuration:\n\n```python\nasync def parse(self, response: Response):\n    # Pass extra headers for this specific request\n    yield Request(\n        \"https://api.example.com/data\",\n        headers={\"Authorization\": \"Bearer token123\"},\n        callback=self.parse_api,\n    )\n\n    # Use a different HTTP method\n    yield Request(\n        \"https://example.com/submit\",\n        method=\"POST\",\n        data={\"field\": \"value\"},\n        sid=\"firefox\",\n        callback=self.parse_result,\n    )\n```\n\n**Warning:** When using `FetcherSession` in spiders, you cannot use `.get()` and `.post()` methods directly. By default, the request is an HTTP GET request; to use another HTTP method, pass it to the `method` argument as in the above example. This unifies the `Request` interface across all session types.\n\nFor browser sessions (`AsyncDynamicSession`, `AsyncStealthySession`), you can pass browser-specific arguments like `wait_selector`, `page_action`, or `extra_headers`:\n\n```python\nasync def parse(self, response: Response):\n    # Use Cloudflare solver with the `AsyncStealthySession` we configured above\n    yield Request(\n        \"https://nopecha.com/demo/cloudflare\",\n        sid=\"stealth\",\n        callback=self.parse_result,\n        solve_cloudflare=True,\n        block_webrtc=True,\n        hide_canvas=True,\n        google_search=True,\n    )\n\n    yield response.follow(\n        \"/dynamic-page\",\n        sid=\"browser\",\n        callback=self.parse_dynamic,\n        wait_selector=\"div.loaded\",\n        network_idle=True,\n    )\n```\n\n**Warning:** Session arguments (**kwargs) passed from the original request are inherited by `response.follow()`. New kwargs take precedence over inherited ones.\n\n```python\nfrom scrapling.spiders import Spider, Response\nfrom scrapling.fetchers import FetcherSession\n\nclass ProductSpider(Spider):\n    name = \"products\"\n    start_urls = [\"https://shop.example.com/products\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"http\", FetcherSession(impersonate='chrome'))\n\n    async def parse(self, response: Response):\n        # I don't want the follow request to impersonate a desktop Chrome like the previous request, but a mobile one\n        # so I override it like this\n        for link in response.css(\"a.product::attr(href)\").getall():\n            yield response.follow(link, impersonate=\"chrome131_android\", callback=self.parse_product)\n\n        next_page = response.css(\"a.next::attr(href)\").get()\n        if next_page:\n            yield Request(next_page)\n\n    async def parse_product(self, response: Response):\n        yield {\n            \"name\": response.css(\"h1::text\").get(\"\"),\n            \"price\": response.css(\".price::text\").get(\"\"),\n        }\n```\n**Note:** Upon spider closure, the manager automatically checks whether any sessions are still running and closes them before closing the spider."
  },
  {
    "path": "benchmarks.py",
    "content": "import functools\nimport time\nimport timeit\nfrom statistics import mean\n\nimport requests\nfrom autoscraper import AutoScraper\nfrom bs4 import BeautifulSoup\nfrom lxml import etree, html\nfrom mechanicalsoup import StatefulBrowser\nfrom parsel import Selector\nfrom pyquery import PyQuery as pq\nfrom selectolax.parser import HTMLParser\n\nfrom scrapling import Selector as ScraplingSelector\n\nlarge_html = (\n    \"<html><body>\" + '<div class=\"item\">' * 5000 + \"</div>\" * 5000 + \"</body></html>\"\n)\n\n\ndef benchmark(func):\n    @functools.wraps(func)\n    def wrapper(*args, **kwargs):\n        benchmark_name = func.__name__.replace(\"test_\", \"\").replace(\"_\", \" \")\n        print(f\"-> {benchmark_name}\", end=\" \", flush=True)\n        # Warm-up phase\n        timeit.repeat(\n            lambda: func(*args, **kwargs), number=2, repeat=2, globals=globals()\n        )\n        # Measure time (1 run, repeat 100 times, take average)\n        times = timeit.repeat(\n            lambda: func(*args, **kwargs),\n            number=1,\n            repeat=100,\n            globals=globals(),\n            timer=time.process_time,\n        )\n        min_time = round(mean(times) * 1000, 2)  # Convert to milliseconds\n        print(f\"average execution time: {min_time} ms\")\n        return min_time\n\n    return wrapper\n\n\n@benchmark\ndef test_lxml():\n    return [\n        e.text\n        for e in etree.fromstring(\n            large_html,\n            # Scrapling and Parsel use the same parser inside, so this is just to make it fair\n            parser=html.HTMLParser(recover=True, huge_tree=True),\n        ).cssselect(\".item\")\n    ]\n\n\n@benchmark\ndef test_bs4_lxml():\n    return [e.text for e in BeautifulSoup(large_html, \"lxml\").select(\".item\")]\n\n\n@benchmark\ndef test_bs4_html5lib():\n    return [e.text for e in BeautifulSoup(large_html, \"html5lib\").select(\".item\")]\n\n\n@benchmark\ndef test_pyquery():\n    return [e.text() for e in pq(large_html)(\".item\").items()]\n\n\n@benchmark\ndef test_scrapling():\n    # No need to do `.extract()` like parsel to extract text\n    # Also, this is faster than `[t.text for t in Selector(large_html, adaptive=False).css('.item')]`\n    # for obvious reasons, of course.\n    return ScraplingSelector(large_html, adaptive=False).css(\".item::text\").getall()\n\n\n@benchmark\ndef test_parsel():\n    return Selector(text=large_html).css(\".item::text\").extract()\n\n\n@benchmark\ndef test_mechanicalsoup():\n    browser = StatefulBrowser()\n    browser.open_fake_page(large_html)\n    return [e.text for e in browser.page.select(\".item\")]\n\n\n@benchmark\ndef test_selectolax():\n    return [node.text() for node in HTMLParser(large_html).css(\".item\")]\n\n\ndef display(results):\n    # Sort and display results\n    sorted_results = sorted(results.items(), key=lambda x: x[1])  # Sort by time\n    scrapling_time = results[\"Scrapling\"]\n    print(\"\\nRanked Results (fastest to slowest):\")\n    print(f\" i. {'Library tested':<18} | {'avg. time (ms)':<15} | vs Scrapling\")\n    print(\"-\" * 50)\n    for i, (test_name, test_time) in enumerate(sorted_results, 1):\n        compare = round(test_time / scrapling_time, 3)\n        print(f\" {i}. 
{test_name:<18} | {str(test_time):<15} | {compare}\")\n\n\n@benchmark\ndef test_scrapling_text(request_html):\n    return ScraplingSelector(request_html, adaptive=False).find_by_text(\"Tipping the Velvet\", first_match=True, clean_match=False).find_similar(ignore_attributes=[\"title\"])\n\n\n@benchmark\ndef test_autoscraper(request_html):\n    # autoscraper by default returns elements text\n    return AutoScraper().build(html=request_html, wanted_list=[\"Tipping the Velvet\"])\n\n\nif __name__ == \"__main__\":\n    print(\n        \" Benchmark: Speed of parsing and retrieving the text content of 5000 nested elements \\n\"\n    )\n    results1 = {\n        \"Raw Lxml\": test_lxml(),\n        \"Parsel/Scrapy\": test_parsel(),\n        \"Scrapling\": test_scrapling(),\n        \"Selectolax\": test_selectolax(),\n        \"PyQuery\": test_pyquery(),\n        \"BS4 with Lxml\": test_bs4_lxml(),\n        \"MechanicalSoup\": test_mechanicalsoup(),\n        \"BS4 with html5lib\": test_bs4_html5lib(),\n    }\n\n    display(results1)\n    print(\"\\n\" + \"=\" * 25)\n    req = requests.get(\"https://books.toscrape.com/index.html\")\n    print(\n        \" Benchmark: Speed of searching for an element by text content, and retrieving the text of similar elements\\n\"\n    )\n    results2 = {\n        \"Scrapling\": test_scrapling_text(req.text),\n        \"AutoScraper\": test_autoscraper(req.text),\n    }\n    display(results2)\n"
  },
  {
    "path": "cleanup.py",
    "content": "import shutil\nfrom pathlib import Path\n\n\n# Clean up after installing for local development\ndef clean():\n    # Get the current directory\n    base_dir = Path.cwd()\n\n    # Directories and patterns to clean\n    cleanup_patterns = [\n        \"build\",\n        \"dist\",\n        \"*.egg-info\",\n        \"__pycache__\",\n        \".eggs\",\n        \".pytest_cache\",\n    ]\n\n    # Clean directories\n    for pattern in cleanup_patterns:\n        for path in base_dir.glob(pattern):\n            try:\n                if path.is_dir():\n                    shutil.rmtree(path)\n                else:\n                    path.unlink()\n                print(f\"Removed: {path}\")\n            except Exception as e:\n                print(f\"Could not remove {path}: {e}\")\n\n    # Remove compiled Python files\n    for path in base_dir.rglob(\"*.py[co]\"):\n        try:\n            path.unlink()\n            print(f\"Removed compiled file: {path}\")\n        except Exception as e:\n            print(f\"Could not remove {path}: {e}\")\n\n\nif __name__ == \"__main__\":\n    clean()\n"
  },
  {
    "path": "docs/README_AR.md",
    "content": "<!-- mcp-name: io.github.D4Vinci/Scrapling -->\n\n<h1 align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io\">\n        <picture>\n          <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true\">\n          <img alt=\"Scrapling Poster\" src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true\">\n        </picture>\n    </a>\n    <br>\n    <small>Effortless Web Scraping for the Modern Web</small>\n</h1>\n\n<p align=\"center\">\n    <a href=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml\" alt=\"Tests\">\n        <img alt=\"Tests\" src=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg\"></a>\n    <a href=\"https://badge.fury.io/py/Scrapling\" alt=\"PyPI version\">\n        <img alt=\"PyPI version\" src=\"https://badge.fury.io/py/Scrapling.svg\"></a>\n    <a href=\"https://clickpy.clickhouse.com/dashboard/scrapling\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/dm/scrapling\" alt=\"PyPI package downloads\"></a>\n    <a href=\"https://github.com/D4Vinci/Scrapling/tree/main/agent-skill\" alt=\"AI Agent Skill directory\">\n        <img alt=\"Static Badge\" src=\"https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill\"></a>\n    <a href=\"https://clawhub.ai/D4Vinci/scrapling-official\" alt=\"OpenClaw Skill\">\n        <img alt=\"OpenClaw Skill\" src=\"https://img.shields.io/badge/Clawhub-darkred?style=flat&label=OpenClaw&link=https%3A%2F%2Fclawhub.ai%2FD4Vinci%2Fscrapling-official\"></a>\n    <br/>\n    <a href=\"https://discord.gg/EMgGbDceNQ\" alt=\"Discord\" target=\"_blank\">\n      <img alt=\"Discord\" src=\"https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ\">\n    </a>\n    <a href=\"https://x.com/Scrapling_dev\" alt=\"X (formerly Twitter)\">\n      <img alt=\"X (formerly Twitter) Follow\" src=\"https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev\">\n    </a>\n    <br/>\n    <a href=\"https://pypi.org/project/scrapling/\" alt=\"Supported Python versions\">\n        <img alt=\"Supported Python versions\" src=\"https://img.shields.io/pypi/pyversions/scrapling.svg\"></a>\n</p>\n\n<p align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io/en/latest/parsing/selection.html\"><strong>طرق الاختيار</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/fetching/choosing.html\"><strong>اختيار Fetcher</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/architecture.html\"><strong>العناكب</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html\"><strong>تدوير البروكسي</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/cli/overview.html\"><strong>واجهة سطر الأوامر</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html\"><strong>وضع MCP</strong></a>\n</p>\n\nScrapling هو إطار عمل تكيفي لـ Web Scraping يتعامل مع كل شيء من طلب واحد إلى زحف كامل النطاق.\n\nمحلله يتعلم من تغييرات المواقع ويعيد تحديد موقع عناصرك تلقائياً عند تحديث الصفحات. جوالبه تتجاوز أنظمة مكافحة الروبوتات مثل Cloudflare Turnstile مباشرةً. 
وإطار عمل Spider الخاص به يتيح لك التوسع إلى عمليات زحف متزامنة ومتعددة الجلسات مع إيقاف/استئناف وتدوير تلقائي لـ Proxy - كل ذلك في بضعة أسطر من Python. مكتبة واحدة، بدون تنازلات.\n\nزحف سريع للغاية مع إحصائيات فورية و Streaming. مبني بواسطة مستخرجي الويب لمستخرجي الويب والمستخدمين العاديين، هناك شيء للجميع.\n\n```python\nfrom scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\nStealthyFetcher.adaptive = True\np = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)  # احصل على الموقع بشكل خفي!\nproducts = p.css('.product', auto_save=True)                                        # استخرج بيانات تنجو من تغييرات تصميم الموقع!\nproducts = p.css('.product', adaptive=True)                                         # لاحقاً، إذا تغيرت بنية الموقع، مرر `adaptive=True` للعثور عليها!\n```\nأو توسع إلى عمليات زحف كاملة\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass MySpider(Spider):\n  name = \"demo\"\n  start_urls = [\"https://example.com/\"]\n\n  async def parse(self, response: Response):\n      for item in response.css('.product'):\n          yield {\"title\": item.css('h2::text').get()}\n\nMySpider().start()\n```\n\n<p align=\"center\">\n    <a href=\"https://dataimpulse.com/?utm_source=scrapling&utm_medium=banner&utm_campaign=scrapling\" target=\"_blank\" style=\"display:flex; justify-content:center; padding:4px 0;\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/DataImpulse.png\" alt=\"At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.\" style=\"max-height:60px;\">\n    </a>\n</p>\n\n# الرعاة البلاتينيون\n<table>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling\" target=\"_blank\" title=\"Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png\">\n      </a>\n    </td>\n    <td> Scrapling يتعامل مع Cloudflare Turnstile. للحماية على مستوى المؤسسات، توفر <a href=\"https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling\">\n        <b>Hyper Solutions</b>\n      </a> نقاط نهاية API تولّد رموز antibot صالحة لـ <b>Akamai</b>، <b>DataDome</b>، <b>Kasada</b> و <b>Incapsula</b>. استدعاءات API بسيطة، بدون أتمتة متصفح. </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://birdproxies.com/t/scrapling\" target=\"_blank\" title=\"At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg\">\n      </a>\n    </td>\n    <td>مرحباً، لقد بنينا <a href=\"https://birdproxies.com/t/scrapling\">\n        <b>BirdProxies</b>\n      </a> لأن البروكسيات لا يجب أن تكون معقدة أو باهظة الثمن. <br /> بروكسيات سكنية و ISP سريعة في أكثر من 195 موقعاً، أسعار عادلة، ودعم حقيقي. 
<br />\n      <b>جرّب لعبة FlappyBird على صفحة الهبوط للحصول على بيانات مجانية!</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\" target=\"_blank\" title=\"Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\">\n        <b>Evomi</b>\n      </a>: بروكسيات سكنية بدءاً من 0.49$/جيجابايت. متصفح سكرابينج مع Chromium مُزيّف بالكامل، عناوين IP سكنية، حل تلقائي لـ CAPTCHA، وتجاوز أنظمة مكافحة البوتات. </br>\n      <b>واجهة Scraper API لنتائج بدون عناء. تكاملات MCP و N8N متاحة.</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\" title=\"Unlock the Power of Social Media Data & AI\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\">TikHub.io</a> يوفر أكثر من 900 واجهة API مستقرة عبر أكثر من 16 منصة تشمل TikTok و X و YouTube و Instagram، مع أكثر من 40 مليون مجموعة بيانات. <br /> يقدم أيضاً <a href=\"https://ai.tikhub.io/?ref=KarimShoair\" target=\"_blank\">نماذج ذكاء اصطناعي بأسعار مخفضة</a> — Claude و GPT و GEMINI والمزيد بخصم يصل إلى 71%.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\" title=\"Scalable Web Data Access for AI Applications\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\">Nsocks</a> يوفر بروكسيات سكنية و ISP سريعة للمطورين والسكرابرز. تغطية IP عالمية، إخفاء هوية عالي، تدوير ذكي، وأداء موثوق للأتمتة واستخراج البيانات. استخدم <a href=\"https://www.xcrawl.com/?keyword=2p67aivg\" target=\"_blank\">Xcrawl</a> لتبسيط زحف الويب على نطاق واسع.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\" title=\"PetroSky delivers cutting-edge VPS hosting.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png\">\n      </a>\n    </td>\n    <td>\n    أغلق حاسوبك. أدوات الكشط تواصل العمل. <br />\n    <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\">PetroSky VPS</a> - خوادم سحابية مصممة للأتمتة المتواصلة. أجهزة Windows وLinux مع تحكم كامل. 
بدءًا من 6.99 يورو/شهريًا.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\" title=\"The #1 newsletter dedicated to Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png\">\n      </a>\n    </td>\n    <td>\n    اقرأ مراجعة كاملة عن <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\">Scrapling على The Web Scraping Club</a> (نوفمبر 2025)، النشرة الإخبارية الأولى المخصصة لكشط الويب.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\" title=\"Proxy-Seller provides reliable proxy infrastructure for Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\">Proxy-Seller</a> يوفر بنية تحتية موثوقة للبروكسي لكشط الويب، بما في ذلك بروكسيات IPv4 وIPv6 وISP والسكنية والمحمولة مع أداء مستقر وتغطية جغرافية واسعة وخطط مرنة لجمع البيانات على نطاق الأعمال.\n    </td>\n  </tr>\n</table>\n\n<i><sub>هل تريد عرض إعلانك هنا؟ انقر [هنا](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>\n# الرعاة\n\n<!-- sponsors -->\n\n<a href=\"https://serpapi.com/?utm_source=scrapling\" target=\"_blank\" title=\"Scrape Google and other search engines with SerpApi\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png\"></a>\n<a href=\"https://visit.decodo.com/Dy6W0b\" target=\"_blank\" title=\"Try the Most Efficient Residential Proxies for Free\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png\"></a>\n<a href=\"https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci\" target=\"_blank\" title=\"The web scraping service that actually beats anti-bot systems!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png\"></a>\n<a href=\"https://proxyempire.io/?ref=scrapling&utm_source=scrapling\" target=\"_blank\" title=\"Collect The Data Your Project Needs with the Best Residential Proxies\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png\"></a><a href=\"https://www.swiftproxy.net/\" target=\"_blank\" title=\"Unlock Reliable Proxy Services with Swiftproxy!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png\"></a>\n<a href=\"https://www.rapidproxy.io/?ref=d4v\" target=\"_blank\" title=\"Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs.\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg\"></a>\n<a href=\"https://browser.cash/?utm_source=D4Vinci&utm_medium=referral\" target=\"_blank\" title=\"Browser Automation & AI Browser Agent Platform\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png\"></a>\n\n<!-- /sponsors -->\n\n<i><sub>هل تريد عرض إعلانك هنا؟ انقر [هنا](https://github.com/sponsors/D4Vinci) واختر المستوى الذي يناسبك!</sub></i>\n\n---\n\n## الميزات الرئيسية\n\n### Spiders — إطار عمل زحف كامل\n- 🕷️ **واجهة Spider شبيهة بـ Scrapy**: عرّف Spiders مع `start_urls`، و async `parse` callbacks، وكائنات 
`Request`/`Response`.\n- ⚡ **زحف متزامن**: حدود تزامن قابلة للتكوين، وتحكم بالسرعة حسب النطاق، وتأخيرات التنزيل.\n- 🔄 **دعم الجلسات المتعددة**: واجهة موحدة لطلبات HTTP، ومتصفحات خفية بدون واجهة في Spider واحد — وجّه الطلبات إلى جلسات مختلفة بالمعرّف.\n- 💾 **إيقاف واستئناف**: استمرارية الزحف القائمة على Checkpoint. اضغط Ctrl+C للإيقاف بسلاسة؛ أعد التشغيل للاستئناف من حيث توقفت.\n- 📡 **وضع Streaming**: بث العناصر المستخرجة فور وصولها عبر `async for item in spider.stream()` مع إحصائيات فورية — مثالي لواجهات المستخدم وخطوط الأنابيب وعمليات الزحف الطويلة.\n- 🛡️ **كشف الطلبات المحظورة**: كشف تلقائي وإعادة محاولة للطلبات المحظورة مع منطق قابل للتخصيص.\n- 📦 **تصدير مدمج**: صدّر النتائج عبر الخطافات وخط الأنابيب الخاص بك أو JSON/JSONL المدمج مع `result.items.to_json()` / `result.items.to_jsonl()` على التوالي.\n\n### جلب متقدم للمواقع مع دعم الجلسات\n- **طلبات HTTP**: طلبات HTTP سريعة وخفية مع فئة `Fetcher`. يمكنها تقليد بصمة TLS للمتصفح والرؤوس واستخدام HTTP/3.\n- **التحميل الديناميكي**: جلب المواقع الديناميكية مع أتمتة كاملة للمتصفح من خلال فئة `DynamicFetcher` التي تدعم Chromium من Playwright و Google Chrome.\n- **تجاوز مكافحة الروبوتات**: قدرات تخفي متقدمة مع `StealthyFetcher` وانتحال fingerprint. يمكنه تجاوز جميع أنواع Turnstile/Interstitial من Cloudflare بسهولة بالأتمتة.\n- **إدارة الجلسات**: دعم الجلسات المستمرة مع فئات `FetcherSession` و`StealthySession` و`DynamicSession` لإدارة ملفات تعريف الارتباط والحالة عبر الطلبات.\n- **تدوير Proxy**: `ProxyRotator` مدمج مع استراتيجيات التدوير الدوري أو المخصصة عبر جميع أنواع الجلسات، بالإضافة إلى تجاوزات Proxy لكل طلب.\n- **حظر النطاقات**: حظر الطلبات إلى نطاقات محددة (ونطاقاتها الفرعية) في الجوالب المعتمدة على المتصفح.\n- **دعم Async**: دعم async كامل عبر جميع الجوالب وفئات الجلسات async المخصصة.\n\n### الاستخراج التكيفي والتكامل مع الذكاء الاصطناعي\n- 🔄 **تتبع العناصر الذكي**: إعادة تحديد موقع العناصر بعد تغييرات الموقع باستخدام خوارزميات التشابه الذكية.\n- 🎯 **الاختيار المرن الذكي**: محددات CSS، محددات XPath، البحث القائم على الفلاتر، البحث النصي، البحث بالتعبيرات العادية والمزيد.\n- 🔍 **البحث عن عناصر مشابهة**: تحديد العناصر المشابهة للعناصر الموجودة تلقائياً.\n- 🤖 **خادم MCP للاستخدام مع الذكاء الاصطناعي**: خادم MCP مدمج لـ Web Scraping بمساعدة الذكاء الاصطناعي واستخراج البيانات. يتميز خادم MCP بقدرات قوية مخصصة تستفيد من Scrapling لاستخراج المحتوى المستهدف قبل تمريره إلى الذكاء الاصطناعي (Claude/Cursor/إلخ)، وبالتالي تسريع العمليات وتقليل التكاليف عن طريق تقليل استخدام الرموز. 
([فيديو توضيحي](https://www.youtube.com/watch?v=qyFk3ZNwOxE))\n\n### بنية عالية الأداء ومختبرة ميدانياً\n- 🚀 **سريع كالبرق**: أداء محسّن يتفوق على معظم مكتبات Web Scraping في Python.\n- 🔋 **فعال في استخدام الذاكرة**: هياكل بيانات محسّنة وتحميل كسول لأقل استخدام للذاكرة.\n- ⚡ **تسلسل JSON سريع**: أسرع 10 مرات من المكتبة القياسية.\n- 🏗️ **مُختبر ميدانياً**: لا يمتلك Scrapling فقط تغطية اختبار بنسبة 92٪ وتغطية كاملة لتلميحات الأنواع، بل تم استخدامه يومياً من قبل مئات مستخرجي الويب خلال العام الماضي.\n\n### تجربة صديقة للمطورين/مستخرجي الويب\n- 🎯 **Shell تفاعلي لـ Web Scraping**: Shell IPython مدمج اختياري مع تكامل Scrapling، واختصارات، وأدوات جديدة لتسريع تطوير سكريبتات Web Scraping، مثل تحويل طلبات curl إلى طلبات Scrapling وعرض نتائج الطلبات في متصفحك.\n- 🚀 **استخدمه مباشرة من الطرفية**: اختيارياً، يمكنك استخدام Scrapling لاستخراج عنوان URL دون كتابة سطر واحد من الكود!\n- 🛠️ **واجهة تنقل غنية**: اجتياز DOM متقدم مع طرق التنقل بين العناصر الوالدية والشقيقة والفرعية.\n- 🧬 **معالجة نصوص محسّنة**: تعبيرات عادية مدمجة وطرق تنظيف وعمليات نصية محسّنة.\n- 📝 **إنشاء محددات تلقائي**: إنشاء محددات CSS/XPath قوية لأي عنصر.\n- 🔌 **واجهة مألوفة**: مشابه لـ Scrapy/BeautifulSoup مع نفس العناصر الزائفة المستخدمة في Scrapy/Parsel.\n- 📘 **تغطية كاملة للأنواع**: تلميحات نوع كاملة لدعم IDE ممتاز وإكمال الكود. يتم فحص قاعدة الكود بالكامل تلقائياً بواسطة **PyRight** و**MyPy** مع كل تغيير.\n- 🔋 **صورة Docker جاهزة**: مع كل إصدار، يتم بناء ودفع صورة Docker تحتوي على جميع المتصفحات تلقائياً.\n\n## البدء\n\nلنلقِ نظرة سريعة على ما يمكن لـ Scrapling فعله دون التعمق.\n\n### الاستخدام الأساسي\nطلبات HTTP مع دعم الجلسات\n```python\nfrom scrapling.fetchers import Fetcher, FetcherSession\n\nwith FetcherSession(impersonate='chrome') as session:  # استخدم أحدث إصدار من بصمة TLS لـ Chrome\n    page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)\n    quotes = page.css('.quote .text::text').getall()\n\n# أو استخدم طلبات لمرة واحدة\npage = Fetcher.get('https://quotes.toscrape.com/')\nquotes = page.css('.quote .text::text').getall()\n```\nوضع التخفي المتقدم\n```python\nfrom scrapling.fetchers import StealthyFetcher, StealthySession\n\nwith StealthySession(headless=True, solve_cloudflare=True) as session:  # أبقِ المتصفح مفتوحاً حتى تنتهي\n    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)\n    data = page.css('#padded_content a').getall()\n\n# أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')\ndata = page.css('#padded_content a').getall()\n```\nأتمتة المتصفح الكاملة\n```python\nfrom scrapling.fetchers import DynamicFetcher, DynamicSession\n\nwith DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:  # أبقِ المتصفح مفتوحاً حتى تنتهي\n    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)\n    data = page.xpath('//span[@class=\"text\"]/text()').getall()  # محدد XPath إذا كنت تفضله\n\n# أو استخدم نمط الطلب لمرة واحدة، يفتح المتصفح لهذا الطلب، ثم يغلقه بعد الانتهاء\npage = DynamicFetcher.fetch('https://quotes.toscrape.com/')\ndata = page.css('.quote .text::text').getall()\n```\n\n### Spiders\nابنِ زواحف كاملة مع طلبات متزامنة وأنواع جلسات متعددة وإيقاف/استئناف:\n```python\nfrom scrapling.spiders import Spider, Request, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 10\n\n    async def parse(self, response: Response):\n        for quote in 
response.css('.quote'):\n            yield {\n                \"text\": quote.css('.text::text').get(),\n                \"author\": quote.css('.author::text').get(),\n            }\n\n        next_page = response.css('.next a')\n        if next_page:\n            yield response.follow(next_page[0].attrib['href'])\n\nresult = QuotesSpider().start()\nprint(f\"Scraped {len(result.items)} quotes\")\nresult.items.to_json(\"quotes.json\")\n```\nاستخدم أنواع جلسات متعددة في Spider واحد:\n```python\nfrom scrapling.spiders import Spider, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass MultiSessionSpider(Spider):\n    name = \"multi\"\n    start_urls = [\"https://example.com/\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"fast\", FetcherSession(impersonate=\"chrome\"))\n        manager.add(\"stealth\", AsyncStealthySession(headless=True), lazy=True)\n\n    async def parse(self, response: Response):\n        for link in response.css('a::attr(href)').getall():\n            # وجّه الصفحات المحمية عبر جلسة التخفي\n            if \"protected\" in link:\n                yield Request(link, sid=\"stealth\")\n            else:\n                yield Request(link, sid=\"fast\", callback=self.parse)  # callback صريح\n```\nأوقف واستأنف عمليات الزحف الطويلة مع Checkpoints بتشغيل Spider هكذا:\n```python\nQuotesSpider(crawldir=\"./crawl_data\").start()\n```\nاضغط Ctrl+C للإيقاف بسلاسة — يتم حفظ التقدم تلقائياً. لاحقاً، عند تشغيل Spider مرة أخرى، مرر نفس `crawldir`، وسيستأنف من حيث توقف.\n\n### التحليل المتقدم والتنقل\n```python\nfrom scrapling.fetchers import Fetcher\n\n# اختيار عناصر غني وتنقل\npage = Fetcher.get('https://quotes.toscrape.com/')\n\n# احصل على الاقتباسات بطرق اختيار متعددة\nquotes = page.css('.quote')  # محدد CSS\nquotes = page.xpath('//div[@class=\"quote\"]')  # XPath\nquotes = page.find_all('div', {'class': 'quote'})  # بأسلوب BeautifulSoup\n# نفس الشيء مثل\nquotes = page.find_all('div', class_='quote')\nquotes = page.find_all(['div'], class_='quote')\nquotes = page.find_all(class_='quote')  # وهكذا...\n# البحث عن عنصر بمحتوى النص\nquotes = page.find_by_text('quote', tag='div')\n\n# التنقل المتقدم\nquote_text = page.css('.quote')[0].css('.text::text').get()\nquote_text = page.css('.quote').css('.text::text').getall()  # محددات متسلسلة\nfirst_quote = page.css('.quote')[0]\nauthor = first_quote.next_sibling.css('.author::text')\nparent_container = first_quote.parent\n\n# علاقات العناصر والتشابه\nsimilar_elements = first_quote.find_similar()\nbelow_elements = first_quote.below_elements()\n```\nيمكنك استخدام المحلل مباشرة إذا كنت لا تريد جلب المواقع كما يلي:\n```python\nfrom scrapling.parser import Selector\n\npage = Selector(\"<html>...</html>\")\n```\nوهو يعمل بنفس الطريقة تماماً!\n\n### أمثلة إدارة الجلسات بشكل Async\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession\n\nasync with FetcherSession(http3=True) as session:  # `FetcherSession` واعٍ بالسياق ويعمل في كلا النمطين المتزامن/async\n    page1 = session.get('https://quotes.toscrape.com/')\n    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')\n\n# استخدام جلسة async\nasync with AsyncStealthySession(max_pages=2) as session:\n    tasks = []\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n\n    for url in urls:\n        task = session.fetch(url)\n        tasks.append(task)\n\n    print(session.get_pool_stats())  # اختياري - حالة مجموعة علامات تبويب المتصفح 
(مشغول/حر/خطأ)\n    results = await asyncio.gather(*tasks)\n    print(session.get_pool_stats())\n```\n\n## واجهة سطر الأوامر والـ Shell التفاعلي\n\nيتضمن Scrapling واجهة سطر أوامر قوية:\n\n[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)\n\nتشغيل Shell الـ Web Scraping التفاعلي\n```bash\nscrapling shell\n```\nاستخرج الصفحات إلى ملف مباشرة دون برمجة (يستخرج المحتوى داخل وسم `body` افتراضياً). إذا انتهى ملف الإخراج بـ `.txt`، فسيتم استخراج محتوى النص للهدف. إذا انتهى بـ `.md`، فسيكون تمثيل Markdown لمحتوى HTML؛ إذا انتهى بـ `.html`، فسيكون محتوى HTML نفسه.\n```bash\nscrapling extract get 'https://example.com' content.md\nscrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome'  # جميع العناصر المطابقة لمحدد CSS '#fromSkipToProducts'\nscrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless\nscrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare\n```\n\n> [!NOTE]\n> هناك العديد من الميزات الإضافية، لكننا نريد إبقاء هذه الصفحة موجزة، بما في ذلك خادم MCP والـ Shell التفاعلي لـ Web Scraping. تحقق من الوثائق الكاملة [هنا](https://scrapling.readthedocs.io/en/latest/)\n\n## معايير الأداء\n\nScrapling ليس قوياً فحسب — بل هو أيضاً سريع بشكل مذهل. تقارن المعايير التالية محلل Scrapling مع أحدث إصدارات المكتبات الشائعة الأخرى.\n\n### اختبار سرعة استخراج النص (5000 عنصر متداخل)\n\n| # |      المكتبة      | الوقت (ms) | vs Scrapling |\n|---|:-----------------:|:----------:|:------------:|\n| 1 |     Scrapling     |    2.02    |     1.0x     |\n| 2 |   Parsel/Scrapy   |    2.04    |     1.01     |\n| 3 |     Raw Lxml      |    2.54    |    1.257     |\n| 4 |      PyQuery      |   24.17    |     ~12x     |\n| 5 |    Selectolax     |   82.63    |     ~41x     |\n| 6 |  MechanicalSoup   |  1549.71   |   ~767.1x    |\n| 7 |   BS4 with Lxml   |  1584.31   |   ~784.3x    |\n| 8 | BS4 with html5lib |  3391.91   |   ~1679.1x   |\n\n\n### أداء تشابه العناصر والبحث النصي\n\nقدرات العثور على العناصر التكيفية لـ Scrapling تتفوق بشكل كبير على البدائل:\n\n| المكتبة     | الوقت (ms) | vs Scrapling |\n|-------------|:----------:|:------------:|\n| Scrapling   |    2.39    |     1.0x     |\n| AutoScraper |   12.45    |    5.209x    |\n\n\n> تمثل جميع المعايير متوسطات أكثر من 100 تشغيل. انظر [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) للمنهجية.\n\n## التثبيت\n\nيتطلب Scrapling إصدار Python 3.10 أو أعلى:\n\n```bash\npip install scrapling\n```\n\nيتضمن هذا التثبيت فقط محرك المحلل وتبعياته، بدون أي جوالب أو تبعيات سطر الأوامر.\n\n### التبعيات الاختيارية\n\n1. إذا كنت ستستخدم أياً من الميزات الإضافية أدناه، أو الجوالب، أو فئاتها، فستحتاج إلى تثبيت تبعيات الجوالب وتبعيات المتصفح الخاصة بها على النحو التالي:\n    ```bash\n    pip install \"scrapling[fetchers]\"\n\n    scrapling install           # normal install\n    scrapling install  --force  # force reinstall\n    ```\n\n    يقوم هذا بتنزيل جميع المتصفحات، إلى جانب تبعيات النظام وتبعيات معالجة fingerprint الخاصة بها.\n\n    أو يمكنك تثبيتها من الكود بدلاً من تشغيل أمر كالتالي:\n    ```python\n    from scrapling.cli import install\n\n    install([], standalone_mode=False)          # normal install\n    install([\"--force\"], standalone_mode=False) # force reinstall\n    ```\n\n2. 
ميزات إضافية:\n   - تثبيت ميزة خادم MCP:\n       ```bash\n       pip install \"scrapling[ai]\"\n       ```\n   - تثبيت ميزات Shell (Shell الـ Web Scraping وأمر `extract`):\n       ```bash\n       pip install \"scrapling[shell]\"\n       ```\n   - تثبيت كل شيء:\n       ```bash\n       pip install \"scrapling[all]\"\n       ```\n   تذكر أنك تحتاج إلى تثبيت تبعيات المتصفح مع `scrapling install` بعد أي من هذه الإضافات (إذا لم تكن قد فعلت ذلك بالفعل)\n\n### Docker\nيمكنك أيضاً تثبيت صورة Docker مع جميع الإضافات والمتصفحات باستخدام الأمر التالي من DockerHub:\n```bash\ndocker pull pyd4vinci/scrapling\n```\nأو تنزيلها من سجل GitHub:\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\nيتم بناء هذه الصورة ودفعها تلقائياً باستخدام GitHub Actions والفرع الرئيسي للمستودع.\n\n## المساهمة\n\nنرحب بالمساهمات! يرجى قراءة [إرشادات المساهمة](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) قبل البدء.\n\n## إخلاء المسؤولية\n\n> [!CAUTION]\n> يتم توفير هذه المكتبة للأغراض التعليمية والبحثية فقط. باستخدام هذه المكتبة، فإنك توافق على الامتثال لقوانين استخراج البيانات والخصوصية المحلية والدولية. المؤلفون والمساهمون غير مسؤولين عن أي إساءة استخدام لهذا البرنامج. احترم دائماً شروط خدمة المواقع وملفات robots.txt.\n\n## 🎓 الاستشهادات\nإذا استخدمت مكتبتنا لأغراض بحثية، يرجى الاستشهاد بنا بالمرجع التالي:\n```text\n  @misc{scrapling,\n    author = {Karim Shoair},\n    title = {Scrapling},\n    year = {2024},\n    url = {https://github.com/D4Vinci/Scrapling},\n    note = {An adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl!}\n  }\n```\n\n## الترخيص\n\nهذا العمل مرخص بموجب ترخيص BSD-3-Clause.\n\n## الشكر والتقدير\n\nيتضمن هذا المشروع كوداً معدلاً من:\n- Parsel (ترخيص BSD) — يُستخدم للوحدة الفرعية [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)\n\n---\n<div align=\"center\"><small>مصمم ومصنوع بـ ❤️ بواسطة كريم شعير.</small></div><br>\n"
  },
  {
    "path": "docs/README_CN.md",
    "content": "<!-- mcp-name: io.github.D4Vinci/Scrapling -->\n\n<h1 align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io\">\n        <picture>\n          <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true\">\n          <img alt=\"Scrapling Poster\" src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true\">\n        </picture>\n    </a>\n    <br>\n    <small>Effortless Web Scraping for the Modern Web</small>\n</h1>\n\n<p align=\"center\">\n    <a href=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml\" alt=\"Tests\">\n        <img alt=\"Tests\" src=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg\"></a>\n    <a href=\"https://badge.fury.io/py/Scrapling\" alt=\"PyPI version\">\n        <img alt=\"PyPI version\" src=\"https://badge.fury.io/py/Scrapling.svg\"></a>\n    <a href=\"https://clickpy.clickhouse.com/dashboard/scrapling\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/dm/scrapling\" alt=\"PyPI package downloads\"></a>\n    <a href=\"https://github.com/D4Vinci/Scrapling/tree/main/agent-skill\" alt=\"AI Agent Skill directory\">\n        <img alt=\"Static Badge\" src=\"https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill\"></a>\n    <a href=\"https://clawhub.ai/D4Vinci/scrapling-official\" alt=\"OpenClaw Skill\">\n        <img alt=\"OpenClaw Skill\" src=\"https://img.shields.io/badge/Clawhub-darkred?style=flat&label=OpenClaw&link=https%3A%2F%2Fclawhub.ai%2FD4Vinci%2Fscrapling-official\"></a>\n    <br/>\n    <a href=\"https://discord.gg/EMgGbDceNQ\" alt=\"Discord\" target=\"_blank\">\n      <img alt=\"Discord\" src=\"https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ\">\n    </a>\n    <a href=\"https://x.com/Scrapling_dev\" alt=\"X (formerly Twitter)\">\n      <img alt=\"X (formerly Twitter) Follow\" src=\"https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev\">\n    </a>\n    <br/>\n    <a href=\"https://pypi.org/project/scrapling/\" alt=\"Supported Python versions\">\n        <img alt=\"Supported Python versions\" src=\"https://img.shields.io/pypi/pyversions/scrapling.svg\"></a>\n</p>\n\n<p align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io/en/latest/parsing/selection.html\"><strong>选择方法</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/fetching/choosing.html\"><strong>选择 Fetcher</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/architecture.html\"><strong>爬虫</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html\"><strong>代理轮换</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/cli/overview.html\"><strong>CLI</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html\"><strong>MCP 模式</strong></a>\n</p>\n\nScrapling 是一个自适应 Web Scraping 框架，能处理从单个请求到大规模爬取的一切需求。\n\n它的解析器能够从网站变化中学习，并在页面更新时自动重新定位您的元素。它的 Fetcher 能够开箱即用地绕过 Cloudflare Turnstile 等反机器人系统。它的 Spider 框架让您可以扩展到并发、多 Session 爬取，支持暂停/恢复和自动 Proxy 轮换——只需几行 Python 代码。一个库，零妥协。\n\n极速爬取，实时统计和 Streaming。由 Web Scraper 为 Web Scraper 和普通用户而构建，每个人都能找到适合自己的功能。\n\n```python\nfrom 
scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\nStealthyFetcher.adaptive = True\np = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)  # 隐秘地获取网站！\nproducts = p.css('.product', auto_save=True)                                        # 抓取在网站设计变更后仍能存活的数据！\nproducts = p.css('.product', adaptive=True)                                         # 之后，如果网站结构改变，传递 `adaptive=True` 来找到它们！\n```\n或扩展为完整爬取\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass MySpider(Spider):\n  name = \"demo\"\n  start_urls = [\"https://example.com/\"]\n\n  async def parse(self, response: Response):\n      for item in response.css('.product'):\n          yield {\"title\": item.css('h2::text').get()}\n\nMySpider().start()\n```\n\n<p align=\"center\">\n    <a href=\"https://dataimpulse.com/?utm_source=scrapling&utm_medium=banner&utm_campaign=scrapling\" target=\"_blank\" style=\"display:flex; justify-content:center; padding:4px 0;\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/DataImpulse.png\" alt=\"At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.\" style=\"max-height:60px;\">\n    </a>\n</p>\n\n# 铂金赞助商\n<table>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling\" target=\"_blank\" title=\"Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png\">\n      </a>\n    </td>\n    <td> Scrapling 可处理 Cloudflare Turnstile。对于企业级保护，<a href=\"https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling\">\n        <b>Hyper Solutions</b>\n      </a> 提供 API 端点，生成适用于 <b>Akamai</b>、<b>DataDome</b>、<b>Kasada</b> 和 <b>Incapsula</b> 的有效 antibot 令牌。简单的 API 调用，无需浏览器自动化。 </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://birdproxies.com/t/scrapling\" target=\"_blank\" title=\"At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg\">\n      </a>\n    </td>\n    <td>嘿，我们创建了 <a href=\"https://birdproxies.com/t/scrapling\">\n        <b>BirdProxies</b>\n      </a>，因为代理不应该复杂或昂贵。 <br /> 覆盖 195+ 地区的快速住宅和 ISP 代理，公平定价，真正的支持。 <br />\n      <b>在落地页试试我们的 FlappyBird 游戏，获取免费流量！</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\" target=\"_blank\" title=\"Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\">\n        <b>Evomi</b>\n      </a>：住宅代理低至 0.49 美元/GB。具备完全伪装 Chromium 的爬虫浏览器、住宅 IP、自动验证码解决和反机器人绕过。</br>\n      <b>Scraper API 轻松获取结果。支持 MCP 和 N8N 集成。</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\" title=\"Unlock the Power of Social Media Data & AI\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg\">\n      </a>\n    </td>\n   
 <td>\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\">TikHub.io</a> 提供覆盖 16+ 平台（包括 TikTok、X、YouTube 和 Instagram）的 900+ 稳定 API，拥有 4000 万+ 数据集。<br /> 还提供<a href=\"https://ai.tikhub.io/?ref=KarimShoair\" target=\"_blank\">优惠 AI 模型</a> — Claude、GPT、GEMINI 等，最高优惠 71%。\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\" title=\"Scalable Web Data Access for AI Applications\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\">Nsocks</a> 提供面向开发者和爬虫的快速住宅和 ISP 代理。全球 IP 覆盖、高匿名性、智能轮换，以及可靠的自动化和数据提取性能。使用 <a href=\"https://www.xcrawl.com/?keyword=2p67aivg\" target=\"_blank\">Xcrawl</a> 简化大规模网页爬取。\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\" title=\"PetroSky delivers cutting-edge VPS hosting.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png\">\n      </a>\n    </td>\n    <td>\n    合上笔记本电脑，您的爬虫仍在运行。<br />\n    <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\">PetroSky VPS</a> - 为不间断自动化而生的云服务器。Windows 和 Linux 系统，完全掌控。低至 €6.99/月。\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\" title=\"The #1 newsletter dedicated to Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png\">\n      </a>\n    </td>\n    <td>\n    阅读 <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\">The Web Scraping Club 上关于 Scrapling 的完整评测</a>（2025 年 11 月），这是排名第一的网页抓取专业通讯。\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\" title=\"Proxy-Seller provides reliable proxy infrastructure for Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\">Proxy-Seller</a> 提供可靠的网页抓取代理基础设施，包括 IPv4、IPv6、ISP、住宅和移动代理，具备稳定性能、广泛的地理覆盖和灵活的企业级数据采集方案。\n    </td>\n  </tr>\n</table>\n\n<i><sub>想在这里展示您的广告吗？点击 [这里](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>\n# 赞助商\n\n<!-- sponsors -->\n\n<a href=\"https://serpapi.com/?utm_source=scrapling\" target=\"_blank\" title=\"Scrape Google and other search engines with SerpApi\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png\"></a>\n<a href=\"https://visit.decodo.com/Dy6W0b\" target=\"_blank\" title=\"Try the Most Efficient Residential Proxies for Free\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png\"></a>\n<a href=\"https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci\" target=\"_blank\" title=\"The web scraping service that actually beats anti-bot systems!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png\"></a>\n<a href=\"https://proxyempire.io/?ref=scrapling&utm_source=scrapling\" target=\"_blank\" title=\"Collect The Data Your Project Needs with the Best Residential Proxies\"><img 
src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png\"></a><a href=\"https://www.swiftproxy.net/\" target=\"_blank\" title=\"Unlock Reliable Proxy Services with Swiftproxy!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png\"></a>\n<a href=\"https://www.rapidproxy.io/?ref=d4v\" target=\"_blank\" title=\"Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs.\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg\"></a>\n<a href=\"https://browser.cash/?utm_source=D4Vinci&utm_medium=referral\" target=\"_blank\" title=\"Browser Automation & AI Browser Agent Platform\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png\"></a>\n\n<!-- /sponsors -->\n\n<i><sub>想在这里展示您的广告吗？点击 [这里](https://github.com/sponsors/D4Vinci) 并选择适合您的级别！</sub></i>\n\n---\n\n## 主要特性\n\n### Spider — 完整的爬取框架\n- 🕷️ **类 Scrapy 的 Spider API**：使用 `start_urls`、async `parse` callback 和`Request`/`Response` 对象定义 Spider。\n- ⚡ **并发爬取**：可配置的并发限制、按域名节流和下载延迟。\n- 🔄 **多 Session 支持**：统一接口，支持 HTTP 请求和隐秘无头浏览器在同一个 Spider 中使用——通过 ID 将请求路由到不同的 Session。\n- 💾 **暂停与恢复**：基于 Checkpoint 的爬取持久化。按 Ctrl+C 优雅关闭；重启后从上次停止的地方继续。\n- 📡 **Streaming 模式**：通过 `async for item in spider.stream()` 以实时统计 Streaming 抓取的数据——非常适合 UI、管道和长时间运行的爬取。\n- 🛡️ **被阻止请求检测**：自动检测并重试被阻止的请求，支持自定义逻辑。\n- 📦 **内置导出**：通过钩子和您自己的管道导出结果，或使用内置的 JSON/JSONL，分别通过 `result.items.to_json()`/`result.items.to_jsonl()`。\n\n### 支持 Session 的高级网站获取\n- **HTTP 请求**：使用 `Fetcher` 类进行快速和隐秘的 HTTP 请求。可以模拟浏览器的 TLS fingerprint、标头并使用 HTTP/3。\n- **动态加载**：通过 `DynamicFetcher` 类使用完整的浏览器自动化获取动态网站，支持 Playwright 的 Chromium 和 Google Chrome。\n- **反机器人绕过**：使用 `StealthyFetcher` 的高级隐秘功能和 fingerprint 伪装。可以轻松自动绕过所有类型的 Cloudflare Turnstile/Interstitial。\n- **Session 管理**：使用 `FetcherSession`、`StealthySession` 和 `DynamicSession` 类实现持久化 Session 支持，用于跨请求的 cookie 和状态管理。\n- **Proxy 轮换**：内置 `ProxyRotator`，支持轮询或自定义策略，适用于所有 Session 类型，并支持按请求覆盖 Proxy。\n- **域名屏蔽**：在基于浏览器的 Fetcher 中屏蔽对特定域名（及其子域名）的请求。\n- **Async 支持**：所有 Fetcher 和专用 async Session 类的完整 async 支持。\n\n### 自适应抓取和 AI 集成\n- 🔄 **智能元素跟踪**：使用智能相似性算法在网站更改后重新定位元素。\n- 🎯 **智能灵活选择**：CSS 选择器、XPath 选择器、基于过滤器的搜索、文本搜索、正则表达式搜索等。\n- 🔍 **查找相似元素**：自动定位与已找到元素相似的元素。\n- 🤖 **与 AI 一起使用的 MCP 服务器**：内置 MCP 服务器用于 AI 辅助 Web Scraping 和数据提取。MCP 服务器具有强大的自定义功能，利用 Scrapling 在将内容传递给 AI（Claude/Cursor 等）之前提取目标内容，从而加快操作并通过最小化 token 使用来降低成本。（[演示视频](https://www.youtube.com/watch?v=qyFk3ZNwOxE)）\n\n### 高性能和经过实战测试的架构\n- 🚀 **闪电般快速**：优化性能超越大多数 Python 抓取库。\n- 🔋 **内存高效**：优化的数据结构和延迟加载，最小内存占用。\n- ⚡ **快速 JSON 序列化**：比标准库快 10 倍。\n- 🏗️ **经过实战测试**：Scrapling 不仅拥有 92% 的测试覆盖率和完整的类型提示覆盖率，而且在过去一年中每天被数百名 Web Scraper 使用。\n\n### 对开发者/Web Scraper 友好的体验\n- 🎯 **交互式 Web Scraping Shell**：可选的内置 IPython Shell，具有 Scrapling 集成、快捷方式和新工具，可加快 Web Scraping 脚本开发，例如将 curl 请求转换为 Scrapling 请求并在浏览器中查看请求结果。\n- 🚀 **直接从终端使用**：可选地，您可以使用 Scrapling 抓取 URL 而无需编写任何代码！\n- 🛠️ **丰富的导航 API**：使用父级、兄弟级和子级导航方法进行高级 DOM 遍历。\n- 🧬 **增强的文本处理**：内置正则表达式、清理方法和优化的字符串操作。\n- 📝 **自动选择器生成**：为任何元素生成强大的 CSS/XPath 选择器。\n- 🔌 **熟悉的 API**：类似于 Scrapy/BeautifulSoup，使用与 Scrapy/Parsel 相同的伪元素。\n- 📘 **完整的类型覆盖**：完整的类型提示，出色的 IDE 支持和代码补全。整个代码库在每次更改时都会自动使用**PyRight**和**MyPy**扫描。\n- 🔋 **现成的 Docker 镜像**：每次发布时，包含所有浏览器的 Docker 镜像会自动构建和推送。\n\n## 入门\n\n让我们快速展示 Scrapling 的功能，无需深入了解。\n\n### 基本用法\n支持 Session 的 HTTP 请求\n```python\nfrom scrapling.fetchers import Fetcher, FetcherSession\n\nwith FetcherSession(impersonate='chrome') as session:  # 使用 Chrome 的最新版本 TLS fingerprint\n    page = session.get('https://quotes.toscrape.com/', 
stealthy_headers=True)\n    quotes = page.css('.quote .text::text').getall()\n\n# 或使用一次性请求\npage = Fetcher.get('https://quotes.toscrape.com/')\nquotes = page.css('.quote .text::text').getall()\n```\n高级隐秘模式\n```python\nfrom scrapling.fetchers import StealthyFetcher, StealthySession\n\nwith StealthySession(headless=True, solve_cloudflare=True) as session:  # 保持浏览器打开直到完成\n    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)\n    data = page.css('#padded_content a').getall()\n\n# 或使用一次性请求样式，为此请求打开浏览器，完成后关闭\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')\ndata = page.css('#padded_content a').getall()\n```\n完整的浏览器自动化\n```python\nfrom scrapling.fetchers import DynamicFetcher, DynamicSession\n\nwith DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:  # 保持浏览器打开直到完成\n    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)\n    data = page.xpath('//span[@class=\"text\"]/text()').getall()  # 如果您偏好 XPath 选择器\n\n# 或使用一次性请求样式，为此请求打开浏览器，完成后关闭\npage = DynamicFetcher.fetch('https://quotes.toscrape.com/')\ndata = page.css('.quote .text::text').getall()\n```\n\n### Spider\n构建具有并发请求、多种 Session 类型和暂停/恢复功能的完整爬虫：\n```python\nfrom scrapling.spiders import Spider, Request, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 10\n\n    async def parse(self, response: Response):\n        for quote in response.css('.quote'):\n            yield {\n                \"text\": quote.css('.text::text').get(),\n                \"author\": quote.css('.author::text').get(),\n            }\n\n        next_page = response.css('.next a')\n        if next_page:\n            yield response.follow(next_page[0].attrib['href'])\n\nresult = QuotesSpider().start()\nprint(f\"抓取了 {len(result.items)} 条引用\")\nresult.items.to_json(\"quotes.json\")\n```\n在单个 Spider 中使用多种 Session 类型：\n```python\nfrom scrapling.spiders import Spider, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass MultiSessionSpider(Spider):\n    name = \"multi\"\n    start_urls = [\"https://example.com/\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"fast\", FetcherSession(impersonate=\"chrome\"))\n        manager.add(\"stealth\", AsyncStealthySession(headless=True), lazy=True)\n\n    async def parse(self, response: Response):\n        for link in response.css('a::attr(href)').getall():\n            # 将受保护的页面路由到隐秘 Session\n            if \"protected\" in link:\n                yield Request(link, sid=\"stealth\")\n            else:\n                yield Request(link, sid=\"fast\", callback=self.parse)  # 显式 callback\n```\n通过如下方式运行 Spider 来暂停和恢复长时间爬取，使用 Checkpoint：\n```python\nQuotesSpider(crawldir=\"./crawl_data\").start()\n```\n按 Ctrl+C 优雅暂停——进度会自动保存。之后，当您再次启动 Spider 时，传递相同的 `crawldir`，它将从上次停止的地方继续。\n\n### 高级解析与导航\n```python\nfrom scrapling.fetchers import Fetcher\n\n# 丰富的元素选择和导航\npage = Fetcher.get('https://quotes.toscrape.com/')\n\n# 使用多种选择方法获取引用\nquotes = page.css('.quote')  # CSS 选择器\nquotes = page.xpath('//div[@class=\"quote\"]')  # XPath\nquotes = page.find_all('div', {'class': 'quote'})  # BeautifulSoup 风格\n# 等同于\nquotes = page.find_all('div', class_='quote')\nquotes = page.find_all(['div'], class_='quote')\nquotes = page.find_all(class_='quote')  # 等等...\n# 按文本内容查找元素\nquotes = page.find_by_text('quote', tag='div')\n\n# 高级导航\nquote_text = page.css('.quote')[0].css('.text::text').get()\nquote_text = 
page.css('.quote').css('.text::text').getall()  # 链式选择器\nfirst_quote = page.css('.quote')[0]\nauthor = first_quote.next_sibling.css('.author::text')\nparent_container = first_quote.parent\n\n# 元素关系和相似性\nsimilar_elements = first_quote.find_similar()\nbelow_elements = first_quote.below_elements()\n```\n如果您不想获取网站，可以直接使用解析器，如下所示：\n```python\nfrom scrapling.parser import Selector\n\npage = Selector(\"<html>...</html>\")\n```\n用法完全相同！\n\n### Async Session 管理示例\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession\n\nasync with FetcherSession(http3=True) as session:  # `FetcherSession`是上下文感知的，可以在 sync/async 模式下工作\n    page1 = session.get('https://quotes.toscrape.com/')\n    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')\n\n# Async Session 用法\nasync with AsyncStealthySession(max_pages=2) as session:\n    tasks = []\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n\n    for url in urls:\n        task = session.fetch(url)\n        tasks.append(task)\n\n    print(session.get_pool_stats())  # 可选 - 浏览器标签池的状态（忙/空闲/错误）\n    results = await asyncio.gather(*tasks)\n    print(session.get_pool_stats())\n```\n\n## CLI 和交互式 Shell\n\nScrapling 包含强大的命令行界面：\n\n[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)\n\n启动交互式 Web Scraping Shell\n```bash\nscrapling shell\n```\n直接将页面提取到文件而无需编程（默认提取 `body` 标签内的内容）。如果输出文件以`.txt` 结尾，则将提取目标的文本内容。如果以`.md` 结尾，它将是 HTML 内容的 Markdown 表示；如果以`.html` 结尾，它将是 HTML 内容本身。\n```bash\nscrapling extract get 'https://example.com' content.md\nscrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome'  # 所有匹配 CSS 选择器'#fromSkipToProducts' 的元素\nscrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless\nscrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare\n```\n\n> [!NOTE]\n> 还有许多其他功能，但我们希望保持此页面简洁，包括 MCP 服务器和交互式 Web Scraping Shell。查看完整文档 [这里](https://scrapling.readthedocs.io/en/latest/)\n\n## 性能基准\n\nScrapling 不仅功能强大——它还速度极快。以下基准测试将 Scrapling 的解析器与其他流行库的最新版本进行了比较。\n\n### 文本提取速度测试（5000 个嵌套元素）\n\n| # |         库         | 时间 (ms)  | vs Scrapling |\n|---|:-----------------:|:---------:|:------------:|\n| 1 |     Scrapling     |   2.02    |     1.0x     |\n| 2 |   Parsel/Scrapy   |   2.04    |     1.01     |\n| 3 |     Raw Lxml      |   2.54    |    1.257     |\n| 4 |      PyQuery      |   24.17   |     ~12x     |\n| 5 |    Selectolax     |   82.63   |     ~41x     |\n| 6 |  MechanicalSoup   |  1549.71  |   ~767.1x    |\n| 7 |   BS4 with Lxml   |  1584.31  |   ~784.3x    |\n| 8 | BS4 with html5lib |  3391.91  |   ~1679.1x   |\n\n\n### 元素相似性和文本搜索性能\n\nScrapling 的自适应元素查找功能明显优于替代方案：\n\n| 库           | 时间 (ms) | vs Scrapling |\n|-------------|:---------:|:------------:|\n| Scrapling   |   2.39    |     1.0x     |\n| AutoScraper |   12.45   |    5.209x    |\n\n\n> 所有基准测试代表 100+ 次运行的平均值。请参阅 [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) 了解方法。\n\n## 安装\n\nScrapling 需要 Python 3.10 或更高版本：\n\n```bash\npip install scrapling\n```\n\n此安装仅包括解析器引擎及其依赖项，没有任何 Fetcher 或命令行依赖项。\n\n### 可选依赖项\n\n1. 
如果您要使用以下任何额外功能、Fetcher 或它们的类，您将需要安装 Fetcher 的依赖项和它们的浏览器依赖项，如下所示：\n    ```bash\n    pip install \"scrapling[fetchers]\"\n\n    scrapling install           # normal install\n    scrapling install  --force  # force reinstall\n    ```\n\n    这会下载所有浏览器，以及它们的系统依赖项和 fingerprint 操作依赖项。\n\n    或者你可以从代码中安装，而不是运行命令：\n    ```python\n    from scrapling.cli import install\n\n    install([], standalone_mode=False)          # normal install\n    install([\"--force\"], standalone_mode=False) # force reinstall\n    ```\n\n2. 额外功能：\n   - 安装 MCP 服务器功能：\n       ```bash\n       pip install \"scrapling[ai]\"\n       ```\n   - 安装 Shell 功能（Web Scraping Shell 和 `extract` 命令）：\n       ```bash\n       pip install \"scrapling[shell]\"\n       ```\n   - 安装所有内容：\n       ```bash\n       pip install \"scrapling[all]\"\n       ```\n   请记住，在安装任何这些额外功能后（如果您还没有安装），您需要使用 `scrapling install` 安装浏览器依赖项\n\n### Docker\n您还可以使用以下命令从 DockerHub 安装包含所有额外功能和浏览器的 Docker 镜像：\n```bash\ndocker pull pyd4vinci/scrapling\n```\n或从 GitHub 注册表下载：\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\n此镜像使用 GitHub Actions 和仓库主分支自动构建和推送。\n\n## 贡献\n\n我们欢迎贡献！在开始之前，请阅读我们的 [贡献指南](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md)。\n\n## 免责声明\n\n> [!CAUTION]\n> 此库仅用于教育和研究目的。使用此库即表示您同意遵守本地和国际数据抓取和隐私法律。作者和贡献者对本软件的任何滥用不承担责任。始终尊重网站的服务条款和 robots.txt 文件。\n\n## 🎓 引用\n如果您将我们的库用于研究目的，请使用以下参考文献引用我们：\n```text\n  @misc{scrapling,\n    author = {Karim Shoair},\n    title = {Scrapling},\n    year = {2024},\n    url = {https://github.com/D4Vinci/Scrapling},\n    note = {An adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl!}\n  }\n```\n\n## 许可证\n\n本作品根据 BSD-3-Clause 许可证授权。\n\n## 致谢\n\n此项目包含改编自以下内容的代码：\n- Parsel（BSD 许可证）——用于 [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)子模块\n\n---\n<div align=\"center\"><small>由 Karim Shoair 用❤️设计和制作。</small></div><br>\n"
  },
  {
    "path": "docs/README_DE.md",
    "content": "<!-- mcp-name: io.github.D4Vinci/Scrapling -->\n\n<h1 align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io\">\n        <picture>\n          <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true\">\n          <img alt=\"Scrapling Poster\" src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true\">\n        </picture>\n    </a>\n    <br>\n    <small>Effortless Web Scraping for the Modern Web</small>\n</h1>\n\n<p align=\"center\">\n    <a href=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml\" alt=\"Tests\">\n        <img alt=\"Tests\" src=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg\"></a>\n    <a href=\"https://badge.fury.io/py/Scrapling\" alt=\"PyPI version\">\n        <img alt=\"PyPI version\" src=\"https://badge.fury.io/py/Scrapling.svg\"></a>\n    <a href=\"https://clickpy.clickhouse.com/dashboard/scrapling\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/dm/scrapling\" alt=\"PyPI package downloads\"></a>\n    <a href=\"https://github.com/D4Vinci/Scrapling/tree/main/agent-skill\" alt=\"AI Agent Skill directory\">\n        <img alt=\"Static Badge\" src=\"https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill\"></a>\n    <a href=\"https://clawhub.ai/D4Vinci/scrapling-official\" alt=\"OpenClaw Skill\">\n        <img alt=\"OpenClaw Skill\" src=\"https://img.shields.io/badge/Clawhub-darkred?style=flat&label=OpenClaw&link=https%3A%2F%2Fclawhub.ai%2FD4Vinci%2Fscrapling-official\"></a>\n    <br/>\n    <a href=\"https://discord.gg/EMgGbDceNQ\" alt=\"Discord\" target=\"_blank\">\n      <img alt=\"Discord\" src=\"https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ\">\n    </a>\n    <a href=\"https://x.com/Scrapling_dev\" alt=\"X (formerly Twitter)\">\n      <img alt=\"X (formerly Twitter) Follow\" src=\"https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev\">\n    </a>\n    <br/>\n    <a href=\"https://pypi.org/project/scrapling/\" alt=\"Supported Python versions\">\n        <img alt=\"Supported Python versions\" src=\"https://img.shields.io/pypi/pyversions/scrapling.svg\"></a>\n</p>\n\n<p align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io/en/latest/parsing/selection.html\"><strong>Auswahlmethoden</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/fetching/choosing.html\"><strong>Einen Fetcher wählen</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/architecture.html\"><strong>Spiders</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html\"><strong>Proxy-Rotation</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/cli/overview.html\"><strong>CLI</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html\"><strong>MCP-Modus</strong></a>\n</p>\n\nScrapling ist ein adaptives Web-Scraping-Framework, das alles abdeckt -- von einer einzelnen Anfrage bis hin zu einem umfassenden Crawl.\n\nSein Parser lernt aus Website-Änderungen und lokalisiert Ihre Elemente automatisch neu, wenn sich Seiten aktualisieren. 
Seine Fetcher umgehen Anti-Bot-Systeme wie Cloudflare Turnstile direkt ab Werk. Und sein Spider-Framework ermöglicht es Ihnen, auf parallele Multi-Session-Crawls mit Pause & Resume und automatischer Proxy-Rotation hochzuskalieren -- alles in wenigen Zeilen Python. Eine Bibliothek, keine Kompromisse.\n\nBlitzschnelle Crawls mit Echtzeit-Statistiken und Streaming. Von Web Scrapern für Web Scraper und normale Benutzer entwickelt, ist für jeden etwas dabei.\n\n```python\nfrom scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\nStealthyFetcher.adaptive = True\np = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)  # Website unbemerkt abrufen!\nproducts = p.css('.product', auto_save=True)                                        # Daten scrapen, die Website-Designänderungen überleben!\nproducts = p.css('.product', adaptive=True)                                         # Später, wenn sich die Website-Struktur ändert, `adaptive=True` übergeben, um sie zu finden!\n```\nOder auf vollständige Crawls hochskalieren\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass MySpider(Spider):\n  name = \"demo\"\n  start_urls = [\"https://example.com/\"]\n\n  async def parse(self, response: Response):\n      for item in response.css('.product'):\n          yield {\"title\": item.css('h2::text').get()}\n\nMySpider().start()\n```\n\n<p align=\"center\">\n    <a href=\"https://dataimpulse.com/?utm_source=scrapling&utm_medium=banner&utm_campaign=scrapling\" target=\"_blank\" style=\"display:flex; justify-content:center; padding:4px 0;\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/DataImpulse.png\" alt=\"At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.\" style=\"max-height:60px;\">\n    </a>\n</p>\n\n# Platin-Sponsoren\n<table>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling\" target=\"_blank\" title=\"Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png\">\n      </a>\n    </td>\n    <td> Scrapling bewältigt Cloudflare Turnstile. Für Schutz auf Unternehmensebene bietet <a href=\"https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling\">\n        <b>Hyper Solutions</b>\n      </a> API-Endpunkte, die gültige Antibot-Tokens für <b>Akamai</b>, <b>DataDome</b>, <b>Kasada</b> und <b>Incapsula</b> generieren. Einfache API-Aufrufe, keine Browser-Automatisierung nötig. </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://birdproxies.com/t/scrapling\" target=\"_blank\" title=\"At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg\">\n      </a>\n    </td>\n    <td>Hey, wir haben <a href=\"https://birdproxies.com/t/scrapling\">\n        <b>BirdProxies</b>\n      </a> gebaut, weil Proxies nicht kompliziert oder überteuert sein sollten. <br /> Schnelle Residential- und ISP-Proxies in über 195 Standorten, faire Preise und echter Support. 
<br />\n      <b>Probieren Sie unser FlappyBird-Spiel auf der Landingpage für kostenlose Daten!</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\" target=\"_blank\" title=\"Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\">\n        <b>Evomi</b>\n      </a>: Residential-Proxies ab 0,49 $/GB. Scraping-Browser mit vollständig gefälschtem Chromium, Residential-IPs, automatischer CAPTCHA-Lösung und Anti-Bot-Umgehung. </br>\n      <b>Scraper-API für problemlose Ergebnisse. MCP- und N8N-Integrationen verfügbar.</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\" title=\"Unlock the Power of Social Media Data & AI\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\">TikHub.io</a> bietet über 900 stabile APIs auf mehr als 16 Plattformen, darunter TikTok, X, YouTube und Instagram, mit über 40 Mio. Datensätzen. <br /> Bietet außerdem <a href=\"https://ai.tikhub.io/?ref=KarimShoair\" target=\"_blank\">vergünstigte KI-Modelle</a> — Claude, GPT, GEMINI und mehr mit bis zu 71% Rabatt.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\" title=\"Scalable Web Data Access for AI Applications\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\">Nsocks</a> bietet schnelle Residential- und ISP-Proxies für Entwickler und Scraper. Globale IP-Abdeckung, hohe Anonymität, intelligente Rotation und zuverlässige Leistung für Automatisierung und Datenextraktion. Verwenden Sie <a href=\"https://www.xcrawl.com/?keyword=2p67aivg\" target=\"_blank\">Xcrawl</a>, um großflächiges Web-Crawling zu vereinfachen.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\" title=\"PetroSky delivers cutting-edge VPS hosting.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png\">\n      </a>\n    </td>\n    <td>\n    Klappe den Laptop zu. Deine Scraper laufen weiter. <br />\n    <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\">PetroSky VPS</a> - Cloud-Server für ununterbrochene Automatisierung. Windows- und Linux-Maschinen mit voller Kontrolle. Ab €6,99/Monat.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\" title=\"The #1 newsletter dedicated to Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png\">\n      </a>\n    </td>\n    <td>\n    Lesen Sie eine vollständige Rezension von <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\">Scrapling auf The Web Scraping Club</a> (Nov. 
2025), dem führenden Newsletter für Web Scraping.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\" title=\"Proxy-Seller provides reliable proxy infrastructure for Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\">Proxy-Seller</a> bietet zuverlässige Proxy-Infrastruktur für Web Scraping mit IPv4-, IPv6-, ISP-, Residential- und Mobile-Proxys – stabile Leistung, breite geografische Abdeckung und flexible Tarife für die Datenerfassung im Unternehmensmaßstab.\n    </td>\n  </tr>\n</table>\n\n<i><sub>Möchten Sie Ihre Anzeige hier zeigen? Klicken Sie [hier](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>\n# Sponsoren\n\n<!-- sponsors -->\n\n<a href=\"https://serpapi.com/?utm_source=scrapling\" target=\"_blank\" title=\"Scrape Google and other search engines with SerpApi\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png\"></a>\n<a href=\"https://visit.decodo.com/Dy6W0b\" target=\"_blank\" title=\"Try the Most Efficient Residential Proxies for Free\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png\"></a>\n<a href=\"https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci\" target=\"_blank\" title=\"The web scraping service that actually beats anti-bot systems!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png\"></a>\n<a href=\"https://proxyempire.io/?ref=scrapling&utm_source=scrapling\" target=\"_blank\" title=\"Collect The Data Your Project Needs with the Best Residential Proxies\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png\"></a><a href=\"https://www.swiftproxy.net/\" target=\"_blank\" title=\"Unlock Reliable Proxy Services with Swiftproxy!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png\"></a>\n<a href=\"https://www.rapidproxy.io/?ref=d4v\" target=\"_blank\" title=\"Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs.\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg\"></a>\n<a href=\"https://browser.cash/?utm_source=D4Vinci&utm_medium=referral\" target=\"_blank\" title=\"Browser Automation & AI Browser Agent Platform\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png\"></a>\n\n<!-- /sponsors -->\n\n<i><sub>Möchten Sie Ihre Anzeige hier zeigen? Klicken Sie [hier](https://github.com/sponsors/D4Vinci) und wählen Sie die Stufe, die zu Ihnen passt!</sub></i>\n\n---\n\n## Hauptmerkmale\n\n### Spiders -- Ein vollständiges Crawling-Framework\n- 🕷️ **Scrapy-ähnliche Spider-API**: Definieren Sie Spiders mit `start_urls`, async `parse` Callbacks und `Request`/`Response`-Objekten.\n- ⚡ **Paralleles Crawling**: Konfigurierbare Parallelitätslimits, domainbezogenes Throttling und Download-Verzögerungen.\n- 🔄 **Multi-Session-Unterstützung**: Einheitliche Schnittstelle für HTTP-Anfragen und heimliche Headless-Browser in einem einzigen Spider -- leiten Sie Anfragen per ID an verschiedene Sessions weiter.\n- 💾 **Pause & Resume**: Checkpoint-basierte Crawl-Persistenz. 
Drücken Sie Strg+C für ein kontrolliertes Herunterfahren; starten Sie neu, um dort fortzufahren, wo Sie aufgehört haben.\n- 📡 **Streaming-Modus**: Gescrapte Elemente in Echtzeit streamen über `async for item in spider.stream()` mit Echtzeit-Statistiken -- ideal für UI, Pipelines und lang laufende Crawls.\n- 🛡️ **Erkennung blockierter Anfragen**: Automatische Erkennung und Wiederholung blockierter Anfragen mit anpassbarer Logik.\n- 📦 **Integrierter Export**: Ergebnisse über Hooks und Ihre eigene Pipeline oder den integrierten JSON/JSONL-Export mit `result.items.to_json()` / `result.items.to_jsonl()` exportieren.\n\n### Erweitertes Website-Abrufen mit Session-Unterstützung\n- **HTTP-Anfragen**: Schnelle und heimliche HTTP-Anfragen mit der `Fetcher`-Klasse. Kann Browser-TLS-Fingerprints und Header imitieren und HTTP/3 verwenden.\n- **Dynamisches Laden**: Dynamische Websites mit vollständiger Browser-Automatisierung über die `DynamicFetcher`-Klasse abrufen, die Playwrights Chromium und Google Chrome unterstützt.\n- **Anti-Bot-Umgehung**: Erweiterte Stealth-Fähigkeiten mit `StealthyFetcher` und Fingerprint-Spoofing. Kann alle Arten von Cloudflares Turnstile/Interstitial einfach mit Automatisierung umgehen.\n- **Session-Verwaltung**: Persistente Session-Unterstützung mit den Klassen `FetcherSession`, `StealthySession` und `DynamicSession` für Cookie- und Zustandsverwaltung über Anfragen hinweg.\n- **Proxy-Rotation**: Integrierter `ProxyRotator` mit zyklischen oder benutzerdefinierten Rotationsstrategien über alle Session-Typen hinweg, plus Proxy-Überschreibungen pro Anfrage.\n- **Domain-Blockierung**: Anfragen an bestimmte Domains (und deren Subdomains) in browserbasierten Fetchern blockieren.\n- **Async-Unterstützung**: Vollständige async-Unterstützung über alle Fetcher und dedizierte async Session-Klassen hinweg.\n\n### Adaptives Scraping & KI-Integration\n- 🔄 **Intelligente Element-Verfolgung**: Elemente nach Website-Änderungen mit intelligenten Ähnlichkeitsalgorithmen neu lokalisieren.\n- 🎯 **Intelligente flexible Auswahl**: CSS-Selektoren, XPath-Selektoren, filterbasierte Suche, Textsuche, Regex-Suche und mehr.\n- 🔍 **Ähnliche Elemente finden**: Elemente, die gefundenen Elementen ähnlich sind, automatisch lokalisieren.\n- 🤖 **MCP-Server für die Verwendung mit KI**: Integrierter MCP-Server für KI-unterstütztes Web Scraping und Datenextraktion. Der MCP-Server verfügt über leistungsstarke, benutzerdefinierte Funktionen, die Scrapling nutzen, um gezielten Inhalt zu extrahieren, bevor er an die KI (Claude/Cursor/etc.) übergeben wird, wodurch Vorgänge beschleunigt und Kosten durch Minimierung der Token-Nutzung gesenkt werden. 
([Demo-Video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))\n\n### Hochleistungs- und praxiserprobte Architektur\n- 🚀 **Blitzschnell**: Optimierte Leistung, die die meisten Python-Scraping-Bibliotheken übertrifft.\n- 🔋 **Speichereffizient**: Optimierte Datenstrukturen und Lazy Loading für einen minimalen Speicher-Footprint.\n- ⚡ **Schnelle JSON-Serialisierung**: 10x schneller als die Standardbibliothek.\n- 🏗️ **Praxiserprobt**: Scrapling hat nicht nur eine Testabdeckung von 92% und eine vollständige Type-Hints-Abdeckung, sondern wird seit dem letzten Jahr täglich von Hunderten von Web Scrapern verwendet.\n\n### Entwickler-/Web-Scraper-freundliche Erfahrung\n- 🎯 **Interaktive Web-Scraping-Shell**: Optionale integrierte IPython-Shell mit Scrapling-Integration, Shortcuts und neuen Tools zur Beschleunigung der Web-Scraping-Skriptentwicklung, wie das Konvertieren von Curl-Anfragen in Scrapling-Anfragen und das Anzeigen von Anfrageergebnissen in Ihrem Browser.\n- 🚀 **Direkt vom Terminal aus verwenden**: Optional können Sie Scrapling verwenden, um eine URL zu scrapen, ohne eine einzige Codezeile zu schreiben!\n- 🛠️ **Umfangreiche Navigations-API**: Erweiterte DOM-Traversierung mit Eltern-, Geschwister- und Kind-Navigationsmethoden.\n- 🧬 **Verbesserte Textverarbeitung**: Integrierte Regex, Bereinigungsmethoden und optimierte String-Operationen.\n- 📝 **Automatische Selektorgenerierung**: Robuste CSS/XPath-Selektoren für jedes Element generieren.\n- 🔌 **Vertraute API**: Ähnlich wie Scrapy/BeautifulSoup mit denselben Pseudo-Elementen, die in Scrapy/Parsel verwendet werden.\n- 📘 **Vollständige Typabdeckung**: Vollständige Type Hints für hervorragende IDE-Unterstützung und Code-Vervollständigung. Die gesamte Codebasis wird bei jeder Änderung automatisch mit **PyRight** und **MyPy** gescannt.\n- 🔋 **Fertiges Docker-Image**: Mit jeder Veröffentlichung wird automatisch ein Docker-Image erstellt und gepusht, das alle Browser enthält.\n\n## Erste Schritte\n\nHier ein kurzer Überblick über das, was Scrapling kann, ohne zu sehr ins Detail zu gehen.\n\n### Grundlegende Verwendung\nHTTP-Anfragen mit Session-Unterstützung\n```python\nfrom scrapling.fetchers import Fetcher, FetcherSession\n\nwith FetcherSession(impersonate='chrome') as session:  # Neueste Version von Chromes TLS-Fingerprint verwenden\n    page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)\n    quotes = page.css('.quote .text::text').getall()\n\n# Oder einmalige Anfragen verwenden\npage = Fetcher.get('https://quotes.toscrape.com/')\nquotes = page.css('.quote .text::text').getall()\n```\nErweiterter Stealth-Modus\n```python\nfrom scrapling.fetchers import StealthyFetcher, StealthySession\n\nwith StealthySession(headless=True, solve_cloudflare=True) as session:  # Browser offen halten, bis Sie fertig sind\n    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)\n    data = page.css('#padded_content a').getall()\n\n# Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')\ndata = page.css('#padded_content a').getall()\n```\nVollständige Browser-Automatisierung\n```python\nfrom scrapling.fetchers import DynamicFetcher, DynamicSession\n\nwith DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:  # Browser offen halten, bis Sie fertig sind\n    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)\n    data = 
page.xpath('//span[@class=\"text\"]/text()').getall()  # XPath-Selektor, falls bevorzugt\n\n# Oder einmaligen Anfragenstil verwenden: öffnet den Browser für diese Anfrage und schließt ihn nach Abschluss\npage = DynamicFetcher.fetch('https://quotes.toscrape.com/')\ndata = page.css('.quote .text::text').getall()\n```\n\n### Spiders\nVollständige Crawler mit parallelen Anfragen, mehreren Session-Typen und Pause & Resume erstellen:\n```python\nfrom scrapling.spiders import Spider, Request, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 10\n\n    async def parse(self, response: Response):\n        for quote in response.css('.quote'):\n            yield {\n                \"text\": quote.css('.text::text').get(),\n                \"author\": quote.css('.author::text').get(),\n            }\n\n        next_page = response.css('.next a')\n        if next_page:\n            yield response.follow(next_page[0].attrib['href'])\n\nresult = QuotesSpider().start()\nprint(f\"{len(result.items)} Zitate gescrapt\")\nresult.items.to_json(\"quotes.json\")\n```\nMehrere Session-Typen in einem einzigen Spider verwenden:\n```python\nfrom scrapling.spiders import Spider, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass MultiSessionSpider(Spider):\n    name = \"multi\"\n    start_urls = [\"https://example.com/\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"fast\", FetcherSession(impersonate=\"chrome\"))\n        manager.add(\"stealth\", AsyncStealthySession(headless=True), lazy=True)\n\n    async def parse(self, response: Response):\n        for link in response.css('a::attr(href)').getall():\n            # Geschützte Seiten über die Stealth-Session leiten\n            if \"protected\" in link:\n                yield Request(link, sid=\"stealth\")\n            else:\n                yield Request(link, sid=\"fast\", callback=self.parse)  # Expliziter Callback\n```\nLange Crawls mit Checkpoints pausieren und fortsetzen, indem Sie den Spider so starten:\n```python\nQuotesSpider(crawldir=\"./crawl_data\").start()\n```\nDrücken Sie Strg+C, um kontrolliert zu pausieren -- der Fortschritt wird automatisch gespeichert. 
Wenn Sie den Spider später erneut starten, übergeben Sie dasselbe `crawldir`, und er setzt dort fort, wo er aufgehört hat.\n\n### Erweitertes Parsing & Navigation\n```python\nfrom scrapling.fetchers import Fetcher\n\n# Umfangreiche Elementauswahl und Navigation\npage = Fetcher.get('https://quotes.toscrape.com/')\n\n# Zitate mit verschiedenen Auswahlmethoden abrufen\nquotes = page.css('.quote')  # CSS-Selektor\nquotes = page.xpath('//div[@class=\"quote\"]')  # XPath\nquotes = page.find_all('div', {'class': 'quote'})  # BeautifulSoup-Stil\n# Gleich wie\nquotes = page.find_all('div', class_='quote')\nquotes = page.find_all(['div'], class_='quote')\nquotes = page.find_all(class_='quote')  # und so weiter...\n# Element nach Textinhalt finden\nquotes = page.find_by_text('quote', tag='div')\n\n# Erweiterte Navigation\nquote_text = page.css('.quote')[0].css('.text::text').get()\nquote_text = page.css('.quote').css('.text::text').getall()  # Verkettete Selektoren\nfirst_quote = page.css('.quote')[0]\nauthor = first_quote.next_sibling.css('.author::text')\nparent_container = first_quote.parent\n\n# Elementbeziehungen und Ähnlichkeit\nsimilar_elements = first_quote.find_similar()\nbelow_elements = first_quote.below_elements()\n```\nSie können den Parser direkt verwenden, wenn Sie keine Websites abrufen möchten, wie unten gezeigt:\n```python\nfrom scrapling.parser import Selector\n\npage = Selector(\"<html>...</html>\")\n```\nUnd es funktioniert genau auf die gleiche Weise!\n\n### Beispiele für async Session-Verwaltung\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession\n\nasync with FetcherSession(http3=True) as session:  # `FetcherSession` ist kontextbewusst und kann sowohl in sync- als auch in async-Mustern arbeiten\n    page1 = session.get('https://quotes.toscrape.com/')\n    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')\n\n# Async-Session-Verwendung\nasync with AsyncStealthySession(max_pages=2) as session:\n    tasks = []\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n\n    for url in urls:\n        task = session.fetch(url)\n        tasks.append(task)\n\n    print(session.get_pool_stats())  # Optional - Der Status des Browser-Tab-Pools (beschäftigt/frei/Fehler)\n    results = await asyncio.gather(*tasks)\n    print(session.get_pool_stats())\n```\n\n## CLI & Interaktive Shell\n\nScrapling enthält eine leistungsstarke Befehlszeilenschnittstelle:\n\n[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)\n\nInteraktive Web-Scraping-Shell starten\n```bash\nscrapling shell\n```\nSeiten direkt ohne Programmierung in eine Datei extrahieren (extrahiert standardmäßig den Inhalt im `body`-Tag). Wenn die Ausgabedatei mit `.txt` endet, wird der Textinhalt des Ziels extrahiert. 
Wenn sie mit `.md` endet, ist es eine Markdown-Darstellung des HTML-Inhalts; wenn sie mit `.html` endet, ist es der HTML-Inhalt selbst.\n```bash\nscrapling extract get 'https://example.com' content.md\nscrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome'  # Alle Elemente, die dem CSS-Selektor '#fromSkipToProducts' entsprechen\nscrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless\nscrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare\n```\n\n> [!NOTE]\n> Es gibt viele zusätzliche Funktionen, aber wir möchten diese Seite prägnant halten, einschließlich des MCP-Servers und der interaktiven Web-Scraping-Shell. Schauen Sie sich die vollständige Dokumentation [hier](https://scrapling.readthedocs.io/en/latest/) an\n\n## Leistungsbenchmarks\n\nScrapling ist nicht nur leistungsstark -- es ist auch blitzschnell. Die folgenden Benchmarks vergleichen Scraplings Parser mit den neuesten Versionen anderer beliebter Bibliotheken.\n\n### Textextraktions-Geschwindigkeitstest (5000 verschachtelte Elemente)\n\n| # |    Bibliothek     | Zeit (ms) | vs Scrapling |\n|---|:-----------------:|:---------:|:------------:|\n| 1 |     Scrapling     |   2.02    |     1.0x     |\n| 2 |   Parsel/Scrapy   |   2.04    |     1.01     |\n| 3 |     Raw Lxml      |   2.54    |    1.257     |\n| 4 |      PyQuery      |   24.17   |     ~12x     |\n| 5 |    Selectolax     |   82.63   |     ~41x     |\n| 6 |  MechanicalSoup   |  1549.71  |   ~767.1x    |\n| 7 |   BS4 with Lxml   |  1584.31  |   ~784.3x    |\n| 8 | BS4 with html5lib |  3391.91  |   ~1679.1x   |\n\n\n### Element-Ähnlichkeit & Textsuche-Leistung\n\nScraplings adaptive Element-Finding-Fähigkeiten übertreffen Alternativen deutlich:\n\n| Bibliothek  | Zeit (ms) | vs Scrapling |\n|-------------|:---------:|:------------:|\n| Scrapling   |   2.39    |     1.0x     |\n| AutoScraper |   12.45   |    5.209x    |\n\n\n> Alle Benchmarks stellen Durchschnittswerte von über 100 Durchläufen dar. Siehe [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) für die Methodik.\n\n## Installation\n\nScrapling erfordert Python 3.10 oder höher:\n\n```bash\npip install scrapling\n```\n\nDiese Installation enthält nur die Parser-Engine und ihre Abhängigkeiten, ohne Fetcher oder Kommandozeilenabhängigkeiten.\n\n### Optionale Abhängigkeiten\n\n1. Wenn Sie eine der folgenden zusätzlichen Funktionen, die Fetcher oder ihre Klassen verwenden möchten, müssen Sie die Abhängigkeiten der Fetcher und ihre Browser-Abhängigkeiten wie folgt installieren:\n    ```bash\n    pip install \"scrapling[fetchers]\"\n\n    scrapling install           # normal install\n    scrapling install  --force  # force reinstall\n    ```\n\n    Dies lädt alle Browser zusammen mit ihren Systemabhängigkeiten und Fingerprint-Manipulationsabhängigkeiten herunter.\n\n    Oder Sie können sie aus dem Code heraus installieren, anstatt einen Befehl auszuführen:\n    ```python\n    from scrapling.cli import install\n\n    install([], standalone_mode=False)          # normal install\n    install([\"--force\"], standalone_mode=False) # force reinstall\n    ```\n\n2. 
Zusätzliche Funktionen:\n   - MCP-Server-Funktion installieren:\n       ```bash\n       pip install \"scrapling[ai]\"\n       ```\n   - Shell-Funktionen installieren (Web-Scraping-Shell und der `extract`-Befehl):\n       ```bash\n       pip install \"scrapling[shell]\"\n       ```\n   - Alles installieren:\n       ```bash\n       pip install \"scrapling[all]\"\n       ```\n   Denken Sie daran, dass Sie nach einem dieser Extras (falls noch nicht geschehen) die Browser-Abhängigkeiten mit `scrapling install` installieren müssen\n\n### Docker\nSie können auch ein Docker-Image mit allen Extras und Browsern mit dem folgenden Befehl von DockerHub installieren:\n```bash\ndocker pull pyd4vinci/scrapling\n```\nOder laden Sie es aus der GitHub-Registry herunter:\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\nDieses Image wird automatisch mit GitHub Actions und dem Hauptzweig des Repositorys erstellt und gepusht.\n\n## Beitragen\n\nWir freuen uns über Beiträge! Bitte lesen Sie unsere [Beitragsrichtlinien](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md), bevor Sie beginnen.\n\n## Haftungsausschluss\n\n> [!CAUTION]\n> Diese Bibliothek wird nur zu Bildungs- und Forschungszwecken bereitgestellt. Durch die Nutzung dieser Bibliothek erklären Sie sich damit einverstanden, lokale und internationale Gesetze zum Daten-Scraping und Datenschutz einzuhalten. Die Autoren und Mitwirkenden sind nicht verantwortlich für Missbrauch dieser Software. Respektieren Sie immer die Nutzungsbedingungen von Websites und robots.txt-Dateien.\n\n## 🎓 Zitierungen\nWenn Sie unsere Bibliothek für Forschungszwecke verwendet haben, zitieren Sie uns bitte mit der folgenden Referenz:\n```text\n  @misc{scrapling,\n    author = {Karim Shoair},\n    title = {Scrapling},\n    year = {2024},\n    url = {https://github.com/D4Vinci/Scrapling},\n    note = {An adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl!}\n  }\n```\n\n## Lizenz\n\nDiese Arbeit ist unter der BSD-3-Clause-Lizenz lizenziert.\n\n## Danksagungen\n\nDieses Projekt enthält angepassten Code von:\n- Parsel (BSD-Lizenz) -- Verwendet für das [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)-Submodul\n\n---\n<div align=\"center\"><small>Entworfen und hergestellt mit ❤️ von Karim Shoair.</small></div><br>\n"
  },
  {
    "path": "docs/README_ES.md",
    "content": "<!-- mcp-name: io.github.D4Vinci/Scrapling -->\n\n<h1 align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io\">\n        <picture>\n          <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true\">\n          <img alt=\"Scrapling Poster\" src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true\">\n        </picture>\n    </a>\n    <br>\n    <small>Effortless Web Scraping for the Modern Web</small>\n</h1>\n\n<p align=\"center\">\n    <a href=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml\" alt=\"Tests\">\n        <img alt=\"Tests\" src=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg\"></a>\n    <a href=\"https://badge.fury.io/py/Scrapling\" alt=\"PyPI version\">\n        <img alt=\"PyPI version\" src=\"https://badge.fury.io/py/Scrapling.svg\"></a>\n    <a href=\"https://clickpy.clickhouse.com/dashboard/scrapling\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/dm/scrapling\" alt=\"PyPI package downloads\"></a>\n    <a href=\"https://github.com/D4Vinci/Scrapling/tree/main/agent-skill\" alt=\"AI Agent Skill directory\">\n        <img alt=\"Static Badge\" src=\"https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill\"></a>\n    <a href=\"https://clawhub.ai/D4Vinci/scrapling-official\" alt=\"OpenClaw Skill\">\n        <img alt=\"OpenClaw Skill\" src=\"https://img.shields.io/badge/Clawhub-darkred?style=flat&label=OpenClaw&link=https%3A%2F%2Fclawhub.ai%2FD4Vinci%2Fscrapling-official\"></a>\n    <br/>\n    <a href=\"https://discord.gg/EMgGbDceNQ\" alt=\"Discord\" target=\"_blank\">\n      <img alt=\"Discord\" src=\"https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ\">\n    </a>\n    <a href=\"https://x.com/Scrapling_dev\" alt=\"X (formerly Twitter)\">\n      <img alt=\"X (formerly Twitter) Follow\" src=\"https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev\">\n    </a>\n    <br/>\n    <a href=\"https://pypi.org/project/scrapling/\" alt=\"Supported Python versions\">\n        <img alt=\"Supported Python versions\" src=\"https://img.shields.io/pypi/pyversions/scrapling.svg\"></a>\n</p>\n\n<p align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io/en/latest/parsing/selection.html\"><strong>Métodos de selección</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/fetching/choosing.html\"><strong>Elegir un fetcher</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/architecture.html\"><strong>Spiders</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html\"><strong>Rotación de proxy</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/cli/overview.html\"><strong>CLI</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html\"><strong>Modo MCP</strong></a>\n</p>\n\nScrapling es un framework de Web Scraping adaptativo que se encarga de todo, desde una sola solicitud hasta un rastreo a gran escala.\n\nSu parser aprende de los cambios de los sitios web y relocaliza automáticamente tus elementos cuando las páginas se actualizan. 
Sus fetchers evaden sistemas anti-bot como Cloudflare Turnstile de forma nativa. Y su framework Spider te permite escalar a rastreos concurrentes con múltiples sesiones, con Pause & Resume y rotación automática de Proxy, todo en unas pocas líneas de Python. Una biblioteca, cero compromisos.\n\nRastreos ultrarrápidos con estadísticas en tiempo real y Streaming. Construido por Web Scrapers para Web Scrapers y usuarios regulares, hay algo para todos.\n\n```python\nfrom scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\nStealthyFetcher.adaptive = True\np = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)  # ¡Obtén el sitio web bajo el radar!\nproducts = p.css('.product', auto_save=True)                                        # ¡Extrae datos que sobreviven a cambios de diseño del sitio web!\nproducts = p.css('.product', adaptive=True)                                         # Más tarde, si la estructura del sitio web cambia, ¡pasa `adaptive=True` para encontrarlos!\n```\nO escala a rastreos completos\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass MySpider(Spider):\n  name = \"demo\"\n  start_urls = [\"https://example.com/\"]\n\n  async def parse(self, response: Response):\n      for item in response.css('.product'):\n          yield {\"title\": item.css('h2::text').get()}\n\nMySpider().start()\n```\n\n<p align=\"center\">\n    <a href=\"https://dataimpulse.com/?utm_source=scrapling&utm_medium=banner&utm_campaign=scrapling\" target=\"_blank\" style=\"display:flex; justify-content:center; padding:4px 0;\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/DataImpulse.png\" alt=\"At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.\" style=\"max-height:60px;\">\n    </a>\n</p>\n\n# Patrocinadores Platino\n<table>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling\" target=\"_blank\" title=\"Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png\">\n      </a>\n    </td>\n    <td> Scrapling maneja Cloudflare Turnstile. Para protección de nivel empresarial, <a href=\"https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling\">\n        <b>Hyper Solutions</b>\n      </a> proporciona endpoints API que generan tokens antibot válidos para <b>Akamai</b>, <b>DataDome</b>, <b>Kasada</b> e <b>Incapsula</b>. Simples llamadas API, sin automatización de navegador. </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://birdproxies.com/t/scrapling\" target=\"_blank\" title=\"At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg\">\n      </a>\n    </td>\n    <td>Oye, creamos <a href=\"https://birdproxies.com/t/scrapling\">\n        <b>BirdProxies</b>\n      </a> porque los proxies no deberían ser complicados ni caros. <br /> Proxies residenciales e ISP rápidos en más de 195 ubicaciones, precios justos y soporte real. 
<br />\n      <b>¡Prueba nuestro juego FlappyBird en la página de inicio para obtener datos gratis!</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\" target=\"_blank\" title=\"Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\">\n        <b>Evomi</b>\n      </a>: proxies residenciales desde 0,49 $/GB. Navegador de scraping con Chromium totalmente falsificado, IPs residenciales, resolución automática de CAPTCHA y evasión anti-bot. </br>\n      <b>API Scraper para resultados sin complicaciones. Integraciones MCP y N8N disponibles.</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\" title=\"Unlock the Power of Social Media Data & AI\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\">TikHub.io</a> ofrece más de 900 APIs estables en más de 16 plataformas, incluyendo TikTok, X, YouTube e Instagram, con más de 40M de conjuntos de datos. <br /> También ofrece <a href=\"https://ai.tikhub.io/?ref=KarimShoair\" target=\"_blank\">modelos de IA con descuento</a> — Claude, GPT, GEMINI y más con hasta un 71% de descuento.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\" title=\"Scalable Web Data Access for AI Applications\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\">Nsocks</a> ofrece proxies residenciales e ISP rápidos para desarrolladores y scrapers. Cobertura IP global, alto anonimato, rotación inteligente y rendimiento fiable para automatización y extracción de datos. Usa <a href=\"https://www.xcrawl.com/?keyword=2p67aivg\" target=\"_blank\">Xcrawl</a> para simplificar el crawling web a gran escala.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\" title=\"PetroSky delivers cutting-edge VPS hosting.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png\">\n      </a>\n    </td>\n    <td>\n    Cierra tu portátil. Tus scrapers siguen funcionando. <br />\n    <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\">PetroSky VPS</a> - servidores en la nube diseñados para automatización ininterrumpida. Máquinas Windows y Linux con control total. Desde €6,99/mes.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\" title=\"The #1 newsletter dedicated to Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png\">\n      </a>\n    </td>\n    <td>\n    Lee una reseña completa de <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\">Scrapling en The Web Scraping Club</a> (nov. 
2025), el boletín número uno dedicado al Web Scraping.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\" title=\"Proxy-Seller provides reliable proxy infrastructure for Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\">Proxy-Seller</a> ofrece una infraestructura de proxy fiable para web scraping, con proxies IPv4, IPv6, ISP, residenciales y móviles con rendimiento estable, amplia cobertura geográfica y planes flexibles para la recopilación de datos a escala empresarial.\n    </td>\n  </tr>\n</table>\n\n<i><sub>¿Quieres mostrar tu anuncio aquí? Haz clic [aquí](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>\n# Patrocinadores\n\n<!-- sponsors -->\n\n<a href=\"https://serpapi.com/?utm_source=scrapling\" target=\"_blank\" title=\"Scrape Google and other search engines with SerpApi\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png\"></a>\n<a href=\"https://visit.decodo.com/Dy6W0b\" target=\"_blank\" title=\"Try the Most Efficient Residential Proxies for Free\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png\"></a>\n<a href=\"https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci\" target=\"_blank\" title=\"The web scraping service that actually beats anti-bot systems!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png\"></a>\n<a href=\"https://proxyempire.io/?ref=scrapling&utm_source=scrapling\" target=\"_blank\" title=\"Collect The Data Your Project Needs with the Best Residential Proxies\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png\"></a><a href=\"https://www.swiftproxy.net/\" target=\"_blank\" title=\"Unlock Reliable Proxy Services with Swiftproxy!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png\"></a>\n<a href=\"https://www.rapidproxy.io/?ref=d4v\" target=\"_blank\" title=\"Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs.\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg\"></a>\n<a href=\"https://browser.cash/?utm_source=D4Vinci&utm_medium=referral\" target=\"_blank\" title=\"Browser Automation & AI Browser Agent Platform\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png\"></a>\n\n<!-- /sponsors -->\n\n<i><sub>¿Quieres mostrar tu anuncio aquí? ¡Haz clic [aquí](https://github.com/sponsors/D4Vinci) y elige el nivel que te convenga!</sub></i>\n\n---\n\n## Características Principales\n\n### Spiders — Un Framework Completo de Rastreo\n- 🕷️ **API de Spider al estilo Scrapy**: Define spiders con `start_urls`, callbacks async `parse`, y objetos `Request`/`Response`.\n- ⚡ **Rastreo Concurrente**: Límites de concurrencia configurables, limitación por dominio y retrasos de descarga.\n- 🔄 **Soporte Multi-Session**: Interfaz unificada para solicitudes HTTP y navegadores headless sigilosos en un solo Spider — enruta solicitudes a diferentes sesiones por ID.\n- 💾 **Pause & Resume**: Persistencia de rastreo basada en Checkpoint. 
Presiona Ctrl+C para un cierre ordenado; reinicia para continuar desde donde lo dejaste.\n- 📡 **Modo Streaming**: Transmite elementos extraídos a medida que llegan con `async for item in spider.stream()` con estadísticas en tiempo real — ideal para UI, pipelines y rastreos de larga duración.\n- 🛡️ **Detección de Solicitudes Bloqueadas**: Detección automática y reintento de solicitudes bloqueadas con lógica personalizable.\n- 📦 **Exportación Integrada**: Exporta resultados a través de hooks y tu propio pipeline o el JSON/JSONL integrado con `result.items.to_json()` / `result.items.to_jsonl()` respectivamente.\n\n### Obtención Avanzada de Sitios Web con Soporte de Session\n- **Solicitudes HTTP**: Solicitudes HTTP rápidas y sigilosas con la clase `Fetcher`. Puede imitar el fingerprint TLS de los navegadores, encabezados y usar HTTP/3.\n- **Carga Dinámica**: Obtén sitios web dinámicos con automatización completa del navegador a través de la clase `DynamicFetcher` compatible con Chromium de Playwright y Google Chrome.\n- **Evasión Anti-bot**: Capacidades de sigilo avanzadas con `StealthyFetcher` y falsificación de fingerprint. Puede evadir fácilmente todos los tipos de Turnstile/Interstitial de Cloudflare con automatización.\n- **Gestión de Session**: Soporte de sesión persistente con las clases `FetcherSession`, `StealthySession` y `DynamicSession` para la gestión de cookies y estado entre solicitudes.\n- **Rotación de Proxy**: `ProxyRotator` integrado con estrategias de rotación cíclica o personalizadas en todos los tipos de sesión, además de sobrescrituras de Proxy por solicitud.\n- **Bloqueo de Dominios**: Bloquea solicitudes a dominios específicos (y sus subdominios) en fetchers basados en navegador.\n- **Soporte Async**: Soporte async completo en todos los fetchers y clases de sesión async dedicadas.\n\n### Scraping Adaptativo e Integración con IA\n- 🔄 **Seguimiento Inteligente de Elementos**: Relocaliza elementos después de cambios en el sitio web usando algoritmos inteligentes de similitud.\n- 🎯 **Selección Flexible Inteligente**: Selectores CSS, selectores XPath, búsqueda basada en filtros, búsqueda de texto, búsqueda regex y más.\n- 🔍 **Encontrar Elementos Similares**: Localiza automáticamente elementos similares a los elementos encontrados.\n- 🤖 **Servidor MCP para usar con IA**: Servidor MCP integrado para Web Scraping asistido por IA y extracción de datos. El servidor MCP presenta capacidades potentes y personalizadas que aprovechan Scrapling para extraer contenido específico antes de pasarlo a la IA (Claude/Cursor/etc), acelerando así las operaciones y reduciendo costos al minimizar el uso de tokens. 
([video demo](https://www.youtube.com/watch?v=qyFk3ZNwOxE))\n\n### Arquitectura de Alto Rendimiento y Probada en Batalla\n- 🚀 **Ultrarrápido**: Rendimiento optimizado que supera a la mayoría de las bibliotecas de Web Scraping de Python.\n- 🔋 **Eficiente en Memoria**: Estructuras de datos optimizadas y carga diferida para una huella de memoria mínima.\n- ⚡ **Serialización JSON Rápida**: 10 veces más rápido que la biblioteca estándar.\n- 🏗️ **Probado en batalla**: Scrapling no solo tiene una cobertura de pruebas del 92% y cobertura completa de type hints, sino que ha sido utilizado diariamente por cientos de Web Scrapers durante el último año.\n\n### Experiencia Amigable para Desarrolladores/Web Scrapers\n- 🎯 **Shell Interactivo de Web Scraping**: Shell IPython integrado opcional con integración de Scrapling, atajos y nuevas herramientas para acelerar el desarrollo de scripts de Web Scraping, como convertir solicitudes curl a solicitudes Scrapling y ver resultados de solicitudes en tu navegador.\n- 🚀 **Úsalo directamente desde la Terminal**: Opcionalmente, ¡puedes usar Scrapling para hacer scraping de una URL sin escribir ni una sola línea de código!\n- 🛠️ **API de Navegación Rica**: Recorrido avanzado del DOM con métodos de navegación de padres, hermanos e hijos.\n- 🧬 **Procesamiento de Texto Mejorado**: Métodos integrados de regex, limpieza y operaciones de cadena optimizadas.\n- 📝 **Generación Automática de Selectores**: Genera selectores CSS/XPath robustos para cualquier elemento.\n- 🔌 **API Familiar**: Similar a Scrapy/BeautifulSoup con los mismos pseudo-elementos usados en Scrapy/Parsel.\n- 📘 **Cobertura Completa de Tipos**: Type hints completos para excelente soporte de IDE y autocompletado de código. Todo el código fuente se escanea automáticamente con **PyRight** y **MyPy** en cada cambio.\n- 🔋 **Imagen Docker Lista**: Con cada lanzamiento, se construye y publica automáticamente una imagen Docker que contiene todos los navegadores.\n\n## Primeros Pasos\n\nAquí tienes un vistazo rápido de lo que Scrapling puede hacer sin entrar en profundidad.\n\n### Uso Básico\nSolicitudes HTTP con soporte de sesión\n```python\nfrom scrapling.fetchers import Fetcher, FetcherSession\n\nwith FetcherSession(impersonate='chrome') as session:  # Usa la última versión del fingerprint TLS de Chrome\n    page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)\n    quotes = page.css('.quote .text::text').getall()\n\n# O usa solicitudes de una sola vez\npage = Fetcher.get('https://quotes.toscrape.com/')\nquotes = page.css('.quote .text::text').getall()\n```\nModo sigiloso avanzado\n```python\nfrom scrapling.fetchers import StealthyFetcher, StealthySession\n\nwith StealthySession(headless=True, solve_cloudflare=True) as session:  # Mantén el navegador abierto hasta que termines\n    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)\n    data = page.css('#padded_content a').getall()\n\n# O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')\ndata = page.css('#padded_content a').getall()\n```\nAutomatización completa del navegador\n```python\nfrom scrapling.fetchers import DynamicFetcher, DynamicSession\n\nwith DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:  # Mantén el navegador abierto hasta que termines\n    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)\n    data = 
page.xpath('//span[@class=\"text\"]/text()').getall()  # Selector XPath si lo prefieres\n\n# O usa el estilo de solicitud de una sola vez, abre el navegador para esta solicitud, luego lo cierra después de terminar\npage = DynamicFetcher.fetch('https://quotes.toscrape.com/')\ndata = page.css('.quote .text::text').getall()\n```\n\n### Spiders\nConstruye rastreadores completos con solicitudes concurrentes, múltiples tipos de sesión y Pause & Resume:\n```python\nfrom scrapling.spiders import Spider, Request, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 10\n\n    async def parse(self, response: Response):\n        for quote in response.css('.quote'):\n            yield {\n                \"text\": quote.css('.text::text').get(),\n                \"author\": quote.css('.author::text').get(),\n            }\n\n        next_page = response.css('.next a')\n        if next_page:\n            yield response.follow(next_page[0].attrib['href'])\n\nresult = QuotesSpider().start()\nprint(f\"Se extrajeron {len(result.items)} citas\")\nresult.items.to_json(\"quotes.json\")\n```\nUsa múltiples tipos de sesión en un solo Spider:\n```python\nfrom scrapling.spiders import Spider, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass MultiSessionSpider(Spider):\n    name = \"multi\"\n    start_urls = [\"https://example.com/\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"fast\", FetcherSession(impersonate=\"chrome\"))\n        manager.add(\"stealth\", AsyncStealthySession(headless=True), lazy=True)\n\n    async def parse(self, response: Response):\n        for link in response.css('a::attr(href)').getall():\n            # Enruta las páginas protegidas a través de la sesión sigilosa\n            if \"protected\" in link:\n                yield Request(link, sid=\"stealth\")\n            else:\n                yield Request(link, sid=\"fast\", callback=self.parse)  # callback explícito\n```\nPausa y reanuda rastreos largos con checkpoints ejecutando el Spider así:\n```python\nQuotesSpider(crawldir=\"./crawl_data\").start()\n```\nPresiona Ctrl+C para pausar de forma ordenada — el progreso se guarda automáticamente. 
Después, cuando inicies el Spider de nuevo, pasa el mismo `crawldir`, y continuará desde donde se detuvo.\n\n### Análisis Avanzado y Navegación\n```python\nfrom scrapling.fetchers import Fetcher\n\n# Selección rica de elementos y navegación\npage = Fetcher.get('https://quotes.toscrape.com/')\n\n# Obtén citas con múltiples métodos de selección\nquotes = page.css('.quote')  # Selector CSS\nquotes = page.xpath('//div[@class=\"quote\"]')  # XPath\nquotes = page.find_all('div', {'class': 'quote'})  # Estilo BeautifulSoup\n# Igual que\nquotes = page.find_all('div', class_='quote')\nquotes = page.find_all(['div'], class_='quote')\nquotes = page.find_all(class_='quote')  # y así sucesivamente...\n# Encuentra elementos por contenido de texto\nquotes = page.find_by_text('quote', tag='div')\n\n# Navegación avanzada\nquote_text = page.css('.quote')[0].css('.text::text').get()\nquote_text = page.css('.quote').css('.text::text').getall()  # Selectores encadenados\nfirst_quote = page.css('.quote')[0]\nauthor = first_quote.next_sibling.css('.author::text')\nparent_container = first_quote.parent\n\n# Relaciones y similitud de elementos\nsimilar_elements = first_quote.find_similar()\nbelow_elements = first_quote.below_elements()\n```\nPuedes usar el parser directamente si no necesitas obtener sitios web, como se muestra a continuación:\n```python\nfrom scrapling.parser import Selector\n\npage = Selector(\"<html>...</html>\")\n```\n¡Y funciona exactamente de la misma manera!\n\n### Ejemplos de Gestión de Session Async\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession\n\nasync with FetcherSession(http3=True) as session:  # `FetcherSession` es consciente del contexto y puede funcionar tanto en patrones sync/async\n    page1 = session.get('https://quotes.toscrape.com/')\n    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')\n\n# Uso de sesión async\nasync with AsyncStealthySession(max_pages=2) as session:\n    tasks = []\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n\n    for url in urls:\n        task = session.fetch(url)\n        tasks.append(task)\n\n    print(session.get_pool_stats())  # Opcional - El estado del pool de pestañas del navegador (ocupado/libre/error)\n    results = await asyncio.gather(*tasks)\n    print(session.get_pool_stats())\n```\n\n## CLI y Shell Interactivo\n\nScrapling incluye una poderosa interfaz de línea de comandos:\n\n[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)\n\nLanzar el Shell interactivo de Web Scraping\n```bash\nscrapling shell\n```\nExtraer páginas a un archivo directamente sin programar (Extrae el contenido dentro de la etiqueta `body` por defecto). Si el archivo de salida termina con `.txt`, entonces se extraerá el contenido de texto del objetivo. 
Si termina con `.md`, será una representación Markdown del contenido HTML; si termina con `.html`, será el contenido HTML en sí mismo.\n```bash\nscrapling extract get 'https://example.com' content.md\nscrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome'  # Todos los elementos que coinciden con el selector CSS '#fromSkipToProducts'\nscrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless\nscrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare\n```\n\n> [!NOTE]\n> Hay muchas características adicionales, pero queremos mantener esta página concisa, incluyendo el servidor MCP y el Shell Interactivo de Web Scraping. Consulta la documentación completa [aquí](https://scrapling.readthedocs.io/en/latest/)\n\n## Benchmarks de Rendimiento\n\nScrapling no solo es potente, también es ultrarrápido. Los siguientes benchmarks comparan el parser de Scrapling con las últimas versiones de otras bibliotecas populares.\n\n### Prueba de Velocidad de Extracción de Texto (5000 elementos anidados)\n\n| # |    Biblioteca     | Tiempo (ms) | vs Scrapling |\n|---|:-----------------:|:-----------:|:------------:|\n| 1 |     Scrapling     |    2.02     |     1.0x     |\n| 2 |   Parsel/Scrapy   |    2.04     |     1.01     |\n| 3 |     Raw Lxml      |    2.54     |    1.257     |\n| 4 |      PyQuery      |    24.17    |     ~12x     |\n| 5 |    Selectolax     |    82.63    |     ~41x     |\n| 6 |  MechanicalSoup   |   1549.71   |   ~767.1x    |\n| 7 |   BS4 with Lxml   |   1584.31   |   ~784.3x    |\n| 8 | BS4 with html5lib |   3391.91   |   ~1679.1x   |\n\n\n### Rendimiento de Similitud de Elementos y Búsqueda de Texto\n\nLas capacidades de búsqueda adaptativa de elementos de Scrapling superan significativamente a las alternativas:\n\n| Biblioteca  | Tiempo (ms) | vs Scrapling |\n|-------------|:-----------:|:------------:|\n| Scrapling   |    2.39     |     1.0x     |\n| AutoScraper |    12.45    |    5.209x    |\n\n\n> Todos los benchmarks representan promedios de más de 100 ejecuciones. Ver [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) para la metodología.\n\n## Instalación\n\nScrapling requiere Python 3.10 o superior:\n\n```bash\npip install scrapling\n```\n\nEsta instalación solo incluye el motor de análisis y sus dependencias, sin ningún fetcher ni dependencias de línea de comandos.\n\n### Dependencias Opcionales\n\n1. Si vas a usar alguna de las características adicionales a continuación, los fetchers, o sus clases, necesitarás instalar las dependencias de los fetchers y sus dependencias del navegador de la siguiente manera:\n    ```bash\n    pip install \"scrapling[fetchers]\"\n\n    scrapling install           # normal install\n    scrapling install  --force  # force reinstall\n    ```\n\n    Esto descarga todos los navegadores, junto con sus dependencias del sistema y dependencias de manipulación de fingerprint.\n\n    O puedes instalarlos desde el código en lugar de ejecutar un comando:\n    ```python\n    from scrapling.cli import install\n\n    install([], standalone_mode=False)          # normal install\n    install([\"--force\"], standalone_mode=False) # force reinstall\n    ```\n\n2. 
Características adicionales:\n   - Instalar la característica del servidor MCP:\n       ```bash\n       pip install \"scrapling[ai]\"\n       ```\n   - Instalar características del Shell (Shell de Web Scraping y el comando `extract`):\n       ```bash\n       pip install \"scrapling[shell]\"\n       ```\n   - Instalar todo:\n       ```bash\n       pip install \"scrapling[all]\"\n       ```\n   Recuerda que necesitas instalar las dependencias del navegador con `scrapling install` después de cualquiera de estos extras (si no lo hiciste ya)\n\n### Docker\nTambién puedes instalar una imagen Docker con todos los extras y navegadores con el siguiente comando desde DockerHub:\n```bash\ndocker pull pyd4vinci/scrapling\n```\nO descárgala desde el registro de GitHub:\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\nEsta imagen se construye y publica automáticamente usando GitHub Actions y la rama principal del repositorio.\n\n## Contribuir\n\n¡Damos la bienvenida a las contribuciones! Por favor lee nuestras [pautas de contribución](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) antes de comenzar.\n\n## Descargo de Responsabilidad\n\n> [!CAUTION]\n> Esta biblioteca se proporciona solo con fines educativos y de investigación. Al usar esta biblioteca, aceptas cumplir con las leyes locales e internacionales de scraping de datos y privacidad. Los autores y contribuyentes no son responsables de ningún mal uso de este software. Respeta siempre los términos de servicio de los sitios web y los archivos robots.txt.\n\n## 🎓 Citas\nSi has utilizado nuestra biblioteca con fines de investigación, por favor cítanos con la siguiente referencia:\n```text\n  @misc{scrapling,\n    author = {Karim Shoair},\n    title = {Scrapling},\n    year = {2024},\n    url = {https://github.com/D4Vinci/Scrapling},\n    note = {An adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl!}\n  }\n```\n\n## Licencia\n\nEste trabajo está licenciado bajo la Licencia BSD-3-Clause.\n\n## Agradecimientos\n\nEste proyecto incluye código adaptado de:\n- Parsel (Licencia BSD)—Usado para el submódulo [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)\n\n---\n<div align=\"center\"><small>Diseñado y elaborado con ❤️ por Karim Shoair.</small></div><br>\n"
  },
  {
    "path": "docs/README_FR.md",
    "content": "<!-- mcp-name: io.github.D4Vinci/Scrapling -->\n\n<h1 align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io\">\n        <picture>\n          <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true\">\n          <img alt=\"Scrapling Poster\" src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true\">\n        </picture>\n    </a>\n    <br>\n    <small>Effortless Web Scraping for the Modern Web</small>\n</h1>\n\n<p align=\"center\">\n    <a href=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml\" alt=\"Tests\">\n        <img alt=\"Tests\" src=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg\"></a>\n    <a href=\"https://badge.fury.io/py/Scrapling\" alt=\"PyPI version\">\n        <img alt=\"PyPI version\" src=\"https://badge.fury.io/py/Scrapling.svg\"></a>\n    <a href=\"https://clickpy.clickhouse.com/dashboard/scrapling\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/dm/scrapling\" alt=\"PyPI package downloads\"></a>\n    <a href=\"https://github.com/D4Vinci/Scrapling/tree/main/agent-skill\" alt=\"AI Agent Skill directory\">\n        <img alt=\"Static Badge\" src=\"https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill\"></a>\n    <a href=\"https://clawhub.ai/D4Vinci/scrapling-official\" alt=\"OpenClaw Skill\">\n        <img alt=\"OpenClaw Skill\" src=\"https://img.shields.io/badge/Clawhub-darkred?style=flat&label=OpenClaw&link=https%3A%2F%2Fclawhub.ai%2FD4Vinci%2Fscrapling-official\"></a>\n    <br/>\n    <a href=\"https://discord.gg/EMgGbDceNQ\" alt=\"Discord\" target=\"_blank\">\n      <img alt=\"Discord\" src=\"https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ\">\n    </a>\n    <a href=\"https://x.com/Scrapling_dev\" alt=\"X (formerly Twitter)\">\n      <img alt=\"X (formerly Twitter) Follow\" src=\"https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev\">\n    </a>\n    <br/>\n    <a href=\"https://pypi.org/project/scrapling/\" alt=\"Supported Python versions\">\n        <img alt=\"Supported Python versions\" src=\"https://img.shields.io/pypi/pyversions/scrapling.svg\"></a>\n</p>\n\n<p align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io/en/latest/parsing/selection.html\"><strong>Méthodes de sélection</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/fetching/choosing.html\"><strong>Fetchers</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/architecture.html\"><strong>Spiders</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html\"><strong>Rotation de proxy</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/cli/overview.html\"><strong>CLI</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html\"><strong>MCP</strong></a>\n</p>\n\nScrapling est un framework de Web Scraping adaptatif qui gère tout, d'une simple requête à un crawl à grande échelle.\n\nSon parser apprend des modifications de sites web et relocalise automatiquement vos éléments lorsque les pages sont mises à jour. 
Ses fetchers contournent les systèmes anti-bot comme Cloudflare Turnstile nativement. Et son framework Spider vous permet de monter en charge vers des crawls concurrents multi-sessions avec pause/reprise et rotation automatique de proxy — le tout en quelques lignes de Python. Une seule bibliothèque, zéro compromis.\n\nDes crawls ultra-rapides avec des statistiques en temps réel et du streaming. Conçu par des Web Scrapers pour des Web Scrapers et des utilisateurs réguliers, il y en a pour tout le monde.\n\n```python\nfrom scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\nStealthyFetcher.adaptive = True\np = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)  # Récupérer un site web en toute discrétion !\nproducts = p.css('.product', auto_save=True)                                        # Scraper des données qui survivent aux changements de design !\nproducts = p.css('.product', adaptive=True)                                         # Plus tard, si la structure du site change, passez `adaptive=True` pour les retrouver !\n```\nOu montez en charge vers des crawls complets\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass MySpider(Spider):\n  name = \"demo\"\n  start_urls = [\"https://example.com/\"]\n\n  async def parse(self, response: Response):\n      for item in response.css('.product'):\n          yield {\"title\": item.css('h2::text').get()}\n\nMySpider().start()\n```\n\n<p align=\"center\">\n    <a href=\"https://dataimpulse.com/?utm_source=scrapling&utm_medium=banner&utm_campaign=scrapling\" target=\"_blank\" style=\"display:flex; justify-content:center; padding:4px 0;\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/DataImpulse.png\" alt=\"At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.\" style=\"max-height:60px;\">\n    </a>\n</p>\n\n# Sponsors Platine\n<table>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling\" target=\"_blank\" title=\"Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png\">\n      </a>\n    </td>\n    <td> Scrapling gère Cloudflare Turnstile. Pour une protection de niveau entreprise, <a href=\"https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling\">\n        <b>Hyper Solutions</b>\n      </a> fournit des endpoints API qui génèrent des tokens antibot valides pour <b>Akamai</b>, <b>DataDome</b>, <b>Kasada</b> et <b>Incapsula</b>. De simples appels API, sans automatisation de navigateur. </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://birdproxies.com/t/scrapling\" target=\"_blank\" title=\"At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg\">\n      </a>\n    </td>\n    <td>Nous avons créé <a href=\"https://birdproxies.com/t/scrapling\">\n        <b>BirdProxies</b>\n      </a> parce que les proxies ne devraient pas être compliqués ni trop chers. Des proxies résidentiels et ISP rapides dans plus de 195 localisations, des prix équitables et un vrai support. 
<br />\n      <b>Essayez notre jeu FlappyBird sur la page d'accueil pour des données gratuites !</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\" target=\"_blank\" title=\"Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\">\n        <b>Evomi</b>\n      </a> : proxies résidentiels à partir de 0,49 $/Go. Navigateur de scraping avec Chromium entièrement falsifié, IPs résidentielles, résolution automatique de CAPTCHA et contournement anti-bot. </br>\n      <b>API Scraper pour des résultats sans tracas. Intégrations MCP et N8N disponibles.</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\" title=\"Unlock the Power of Social Media Data & AI\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\">TikHub.io</a> propose plus de 900 APIs stables sur plus de 16 plateformes, dont TikTok, X, YouTube et Instagram, avec plus de 40M de jeux de données. <br /> Propose également des <a href=\"https://ai.tikhub.io/?ref=KarimShoair\" target=\"_blank\">modèles IA à prix réduit</a> — Claude, GPT, GEMINI et plus, jusqu'à 71% de réduction.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\" title=\"Scalable Web Data Access for AI Applications\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\">Nsocks</a> fournit des proxies résidentiels et ISP rapides pour les développeurs et les scrapeurs. Couverture IP mondiale, anonymat élevé, rotation intelligente et performances fiables pour l'automatisation et l'extraction de données. Utilisez <a href=\"https://www.xcrawl.com/?keyword=2p67aivg\" target=\"_blank\">Xcrawl</a> pour simplifier le crawling web à grande échelle.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\" title=\"PetroSky delivers cutting-edge VPS hosting.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png\">\n      </a>\n    </td>\n    <td>\n    Fermez votre ordinateur. Vos scrapers continuent de tourner. <br />\n    <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\">PetroSky VPS</a> - des serveurs cloud conçus pour l'automatisation sans interruption. Machines Windows et Linux avec contrôle total. 
À partir de 6,99 €/mois.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\" title=\"The #1 newsletter dedicated to Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png\">\n      </a>\n    </td>\n    <td>\n    Lisez une critique complète de <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\">Scrapling sur The Web Scraping Club</a> (nov. 2025), la newsletter n°1 dédiée au Web Scraping.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\" title=\"Proxy-Seller provides reliable proxy infrastructure for Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\">Proxy-Seller</a> fournit une infrastructure proxy fiable pour le web scraping, avec des proxys IPv4, IPv6, ISP, résidentiels et mobiles offrant des performances stables, une large couverture géographique et des plans flexibles pour la collecte de données à l'échelle entreprise.\n    </td>\n  </tr>\n</table>\n\n<i><sub>Vous souhaitez afficher votre publicité ici ? Cliquez [ici](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>\n# Sponsors\n\n<!-- sponsors -->\n\n<a href=\"https://serpapi.com/?utm_source=scrapling\" target=\"_blank\" title=\"Scrape Google and other search engines with SerpApi\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png\"></a>\n<a href=\"https://visit.decodo.com/Dy6W0b\" target=\"_blank\" title=\"Try the Most Efficient Residential Proxies for Free\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png\"></a>\n<a href=\"https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci\" target=\"_blank\" title=\"The web scraping service that actually beats anti-bot systems!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png\"></a>\n<a href=\"https://proxyempire.io/?ref=scrapling&utm_source=scrapling\" target=\"_blank\" title=\"Collect The Data Your Project Needs with the Best Residential Proxies\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png\"></a><a href=\"https://www.swiftproxy.net/\" target=\"_blank\" title=\"Unlock Reliable Proxy Services with Swiftproxy!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png\"></a>\n<a href=\"https://www.rapidproxy.io/?ref=d4v\" target=\"_blank\" title=\"Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs.\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg\"></a>\n<a href=\"https://browser.cash/?utm_source=D4Vinci&utm_medium=referral\" target=\"_blank\" title=\"Browser Automation & AI Browser Agent Platform\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png\"></a>\n\n<!-- /sponsors -->\n\n<i><sub>Vous souhaitez afficher votre publicité ici ? 
Cliquez [ici](https://github.com/sponsors/D4Vinci) et choisissez le niveau qui vous convient !</sub></i>\n\n---\n\n## Fonctionnalités principales\n\n### Spiders — Un framework de crawling complet\n- 🕷️ **API Spider à la Scrapy** : Définissez des spiders avec `start_urls`, des callbacks async `parse` et des objets `Request`/`Response`.\n- ⚡ **Crawling concurrent** : Limites de concurrence configurables, throttling par domaine et délais de téléchargement.\n- 🔄 **Support multi-sessions** : Interface unifiée pour les requêtes HTTP et les navigateurs headless furtifs dans un seul spider — routez les requêtes vers différentes sessions par ID.\n- 💾 **Pause & Reprise** : Persistance du crawl basée sur des checkpoints. Appuyez sur Ctrl+C pour un arrêt gracieux ; redémarrez pour reprendre là où vous vous étiez arrêté.\n- 📡 **Mode streaming** : Diffusez les éléments scrapés en temps réel via `async for item in spider.stream()` avec des statistiques en temps réel — idéal pour les UI, pipelines et crawls de longue durée.\n- 🛡️ **Détection des requêtes bloquées** : Détection automatique et réessai des requêtes bloquées avec une logique personnalisable.\n- 📦 **Export intégré** : Exportez les résultats via des hooks et votre propre pipeline ou l'export JSON/JSONL intégré avec `result.items.to_json()` / `result.items.to_jsonl()` respectivement.\n\n### Récupération avancée de sites web avec support de sessions\n- **Requêtes HTTP** : Requêtes HTTP rapides et furtives avec la classe `Fetcher`. Peut imiter l'empreinte TLS des navigateurs, les headers et utiliser HTTP/3.\n- **Chargement dynamique** : Récupérez des sites web dynamiques avec une automatisation complète du navigateur via la classe `DynamicFetcher` supportant Chromium de Playwright et Google Chrome.\n- **Contournement anti-bot** : Capacités de furtivité avancées avec `StealthyFetcher` et usurpation d'empreinte. Peut facilement contourner tous les types de Turnstile/Interstitial de Cloudflare avec l'automatisation.\n- **Gestion de sessions** : Support de sessions persistantes avec les classes `FetcherSession`, `StealthySession` et `DynamicSession` pour la gestion des cookies et de l'état entre les requêtes.\n- **Rotation de proxy** : `ProxyRotator` intégré avec des stratégies de rotation cycliques ou personnalisées sur tous les types de sessions, plus des surcharges de proxy par requête.\n- **Blocage de domaines** : Bloquez les requêtes vers des domaines spécifiques (et leurs sous-domaines) dans les fetchers basés sur navigateur.\n- **Support async** : Support async complet sur tous les fetchers et classes de sessions async dédiées.\n\n### Scraping adaptatif & Intégration IA\n- 🔄 **Suivi intelligent des éléments** : Relocalisez les éléments après des modifications de site web en utilisant des algorithmes de similarité intelligents.\n- 🎯 **Sélection flexible intelligente** : Sélecteurs CSS, sélecteurs XPath, recherche par filtres, recherche textuelle, recherche regex et plus encore.\n- 🔍 **Trouver des éléments similaires** : Localisez automatiquement des éléments similaires aux éléments trouvés.\n- 🤖 **Serveur MCP pour utilisation avec l'IA** : Serveur MCP intégré pour le Web Scraping et l'extraction de données assistés par IA. Le serveur MCP dispose de capacités puissantes et personnalisées qui exploitent Scrapling pour extraire du contenu ciblé avant de le transmettre à l'IA (Claude/Cursor/etc.), accélérant ainsi les opérations et réduisant les coûts en minimisant l'utilisation de tokens. 
([vidéo de démonstration](https://www.youtube.com/watch?v=qyFk3ZNwOxE))\n\n### Architecture haute performance et éprouvée\n- 🚀 **Ultra rapide** : Performance optimisée surpassant la plupart des bibliothèques de scraping Python.\n- 🔋 **Économe en mémoire** : Structures de données optimisées et chargement paresseux pour une empreinte mémoire minimale.\n- ⚡ **Sérialisation JSON rapide** : 10x plus rapide que la bibliothèque standard.\n- 🏗️ **Éprouvé en conditions réelles** : Non seulement Scrapling dispose d'une couverture de tests de 92% et d'une couverture complète des type hints, mais il est utilisé quotidiennement par des centaines de Web Scrapers depuis l'année dernière.\n\n### Expérience conviviale pour développeurs/Web Scrapers\n- 🎯 **Shell interactif de Web Scraping** : Shell IPython intégré optionnel avec intégration Scrapling, raccourcis et nouveaux outils pour accélérer le développement de scripts de Web Scraping, comme la conversion de requêtes curl en requêtes Scrapling et l'affichage des résultats dans votre navigateur.\n- 🚀 **Utilisez-le directement depuis le terminal** : Optionnellement, vous pouvez utiliser Scrapling pour scraper une URL sans écrire une seule ligne de code !\n- 🛠️ **API de navigation riche** : Traversée avancée du DOM avec des méthodes de navigation parent, frère et enfant.\n- 🧬 **Traitement de texte amélioré** : Regex intégrées, méthodes de nettoyage et opérations sur les chaînes optimisées.\n- 📝 **Génération automatique de sélecteurs** : Générez des sélecteurs CSS/XPath robustes pour n'importe quel élément.\n- 🔌 **API familière** : Similaire à Scrapy/BeautifulSoup avec les mêmes pseudo-éléments utilisés dans Scrapy/Parsel.\n- 📘 **Couverture de types complète** : Type hints complets pour un excellent support IDE et la complétion de code. 
L'ensemble de la base de code est automatiquement analysé avec **PyRight** et **MyPy** à chaque modification.\n- 🔋 **Image Docker prête à l'emploi** : À chaque version, une image Docker contenant tous les navigateurs est automatiquement construite et publiée.\n\n## Pour commencer\n\nVoici un aperçu rapide de ce que Scrapling peut faire sans entrer dans les détails.\n\n### Utilisation de base\nRequêtes HTTP avec support de sessions\n```python\nfrom scrapling.fetchers import Fetcher, FetcherSession\n\nwith FetcherSession(impersonate='chrome') as session:  # Utiliser la dernière version de l'empreinte TLS de Chrome\n    page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)\n    quotes = page.css('.quote .text::text').getall()\n\n# Ou utiliser des requêtes ponctuelles\npage = Fetcher.get('https://quotes.toscrape.com/')\nquotes = page.css('.quote .text::text').getall()\n```\nMode furtif avancé\n```python\nfrom scrapling.fetchers import StealthyFetcher, StealthySession\n\nwith StealthySession(headless=True, solve_cloudflare=True) as session:  # Garder le navigateur ouvert jusqu'à ce que vous ayez terminé\n    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)\n    data = page.css('#padded_content a').getall()\n\n# Ou utiliser le style requête ponctuelle : ouvre le navigateur pour cette requête, puis le ferme après\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')\ndata = page.css('#padded_content a').getall()\n```\nAutomatisation complète du navigateur\n```python\nfrom scrapling.fetchers import DynamicFetcher, DynamicSession\n\nwith DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:  # Garder le navigateur ouvert jusqu'à ce que vous ayez terminé\n    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)\n    data = page.xpath('//span[@class=\"text\"]/text()').getall()  # Sélecteur XPath si vous le préférez\n\n# Ou utiliser le style requête ponctuelle : ouvre le navigateur pour cette requête, puis le ferme après\npage = DynamicFetcher.fetch('https://quotes.toscrape.com/')\ndata = page.css('.quote .text::text').getall()\n```\n\n### Spiders\nConstruisez des crawlers complets avec des requêtes concurrentes, plusieurs types de sessions et pause/reprise :\n```python\nfrom scrapling.spiders import Spider, Request, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 10\n\n    async def parse(self, response: Response):\n        for quote in response.css('.quote'):\n            yield {\n                \"text\": quote.css('.text::text').get(),\n                \"author\": quote.css('.author::text').get(),\n            }\n\n        next_page = response.css('.next a')\n        if next_page:\n            yield response.follow(next_page[0].attrib['href'])\n\nresult = QuotesSpider().start()\nprint(f\"{len(result.items)} citations scrapées\")\nresult.items.to_json(\"quotes.json\")\n```\nUtilisez plusieurs types de sessions dans un seul spider :\n```python\nfrom scrapling.spiders import Spider, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass MultiSessionSpider(Spider):\n    name = \"multi\"\n    start_urls = [\"https://example.com/\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"fast\", FetcherSession(impersonate=\"chrome\"))\n        manager.add(\"stealth\", AsyncStealthySession(headless=True), lazy=True)\n\n    async def parse(self, 
response: Response):\n        for link in response.css('a::attr(href)').getall():\n            # Router les pages protégées via la session furtive\n            if \"protected\" in link:\n                yield Request(link, sid=\"stealth\")\n            else:\n                yield Request(link, sid=\"fast\", callback=self.parse)  # Callback explicite\n```\nMettez en pause et reprenez les longs crawls avec des checkpoints en lançant le spider ainsi :\n```python\nQuotesSpider(crawldir=\"./crawl_data\").start()\n```\nAppuyez sur Ctrl+C pour mettre en pause gracieusement — la progression est sauvegardée automatiquement. Plus tard, lorsque vous relancez le spider, passez le même `crawldir`, et il reprendra là où il s'était arrêté.\n\n### Parsing avancé & Navigation\n```python\nfrom scrapling.fetchers import Fetcher\n\n# Sélection riche d'éléments et navigation\npage = Fetcher.get('https://quotes.toscrape.com/')\n\n# Obtenir des citations avec plusieurs méthodes de sélection\nquotes = page.css('.quote')  # Sélecteur CSS\nquotes = page.xpath('//div[@class=\"quote\"]')  # XPath\nquotes = page.find_all('div', {'class': 'quote'})  # Style BeautifulSoup\n# Identique à\nquotes = page.find_all('div', class_='quote')\nquotes = page.find_all(['div'], class_='quote')\nquotes = page.find_all(class_='quote')  # et ainsi de suite...\n# Trouver un élément par contenu textuel\nquotes = page.find_by_text('quote', tag='div')\n\n# Navigation avancée\nquote_text = page.css('.quote')[0].css('.text::text').get()\nquote_text = page.css('.quote').css('.text::text').getall()  # Sélecteurs chaînés\nfirst_quote = page.css('.quote')[0]\nauthor = first_quote.next_sibling.css('.author::text')\nparent_container = first_quote.parent\n\n# Relations et similarité entre éléments\nsimilar_elements = first_quote.find_similar()\nbelow_elements = first_quote.below_elements()\n```\nVous pouvez utiliser le parser directement si vous ne souhaitez pas récupérer de sites web, comme ci-dessous :\n```python\nfrom scrapling.parser import Selector\n\npage = Selector(\"<html>...</html>\")\n```\nEt cela fonctionne exactement de la même manière !\n\n### Exemples de gestion de sessions async\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession\n\nasync with FetcherSession(http3=True) as session:  # `FetcherSession` est sensible au contexte et peut fonctionner en mode sync comme async\n    page1 = session.get('https://quotes.toscrape.com/')\n    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')\n\n# Utilisation de session async\nasync with AsyncStealthySession(max_pages=2) as session:\n    tasks = []\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n\n    for url in urls:\n        task = session.fetch(url)\n        tasks.append(task)\n\n    print(session.get_pool_stats())  # Optionnel - Le statut du pool d'onglets du navigateur (occupé/libre/erreur)\n    results = await asyncio.gather(*tasks)\n    print(session.get_pool_stats())\n```\n\n## CLI & Shell interactif\n\nScrapling inclut une interface en ligne de commande puissante :\n\n[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)\n\nLancer le shell interactif de Web Scraping\n```bash\nscrapling shell\n```\nExtraire des pages directement dans un fichier sans programmation (extrait par défaut le contenu de la balise `body`). Si le fichier de sortie se termine par `.txt`, le contenu textuel de la cible sera extrait. 
S'il se termine par `.md`, ce sera une représentation Markdown du contenu HTML ; s'il se termine par `.html`, ce sera le contenu HTML lui-même.\n```bash\nscrapling extract get 'https://example.com' content.md\nscrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome'  # Tous les éléments correspondant au sélecteur CSS '#fromSkipToProducts'\nscrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless\nscrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare\n```\n\n> [!NOTE]\n> Il existe de nombreuses fonctionnalités supplémentaires, mais nous souhaitons garder cette page concise, y compris le serveur MCP et le shell interactif de Web Scraping. Consultez la documentation complète [ici](https://scrapling.readthedocs.io/en/latest/)\n\n## Benchmarks de performance\n\nScrapling n'est pas seulement puissant — il est aussi ultra rapide. Les benchmarks suivants comparent le parser de Scrapling avec les dernières versions d'autres bibliothèques populaires.\n\n### Test de vitesse d'extraction de texte (5000 éléments imbriqués)\n\n| # |   Bibliothèque    | Temps (ms) | vs Scrapling |\n|---|:-----------------:|:----------:|:------------:|\n| 1 |     Scrapling     |    2.02    |     1.0x     |\n| 2 |   Parsel/Scrapy   |    2.04    |     1.01     |\n| 3 |     Raw Lxml      |    2.54    |    1.257     |\n| 4 |      PyQuery      |   24.17    |     ~12x     |\n| 5 |    Selectolax     |   82.63    |     ~41x     |\n| 6 |  MechanicalSoup   |  1549.71   |   ~767.1x    |\n| 7 |   BS4 with Lxml   |  1584.31   |   ~784.3x    |\n| 8 | BS4 with html5lib |  3391.91   |   ~1679.1x   |\n\n\n### Performance de similarité d'éléments & recherche textuelle\n\nLes capacités adaptatives de recherche d'éléments de Scrapling surpassent significativement les alternatives :\n\n| Bibliothèque | Temps (ms) | vs Scrapling |\n|--------------|:----------:|:------------:|\n| Scrapling    |    2.39    |     1.0x     |\n| AutoScraper  |   12.45    |    5.209x    |\n\n\n> Tous les benchmarks représentent des moyennes de plus de 100 exécutions. Voir [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) pour la méthodologie.\n\n## Installation\n\nScrapling nécessite Python 3.10 ou supérieur :\n\n```bash\npip install scrapling\n```\n\nCette installation n'inclut que le moteur de parsing et ses dépendances, sans aucun fetcher ni dépendance en ligne de commande.\n\n### Dépendances optionnelles\n\n1. Si vous allez utiliser l'une des fonctionnalités supplémentaires ci-dessous, les fetchers ou leurs classes, vous devrez installer les dépendances des fetchers et leurs dépendances navigateur comme suit :\n    ```bash\n    pip install \"scrapling[fetchers]\"\n\n    scrapling install           # installation normale\n    scrapling install  --force  # réinstallation forcée\n    ```\n\n    Cela télécharge tous les navigateurs, ainsi que leurs dépendances système et les dépendances de manipulation d'empreintes.\n\n    Ou vous pouvez les installer depuis le code au lieu d'exécuter une commande :\n    ```python\n    from scrapling.cli import install\n\n    install([], standalone_mode=False)          # installation normale\n    install([\"--force\"], standalone_mode=False) # réinstallation forcée\n    ```\n\n2. 
Fonctionnalités supplémentaires :\n   - Installer la fonctionnalité serveur MCP :\n       ```bash\n       pip install \"scrapling[ai]\"\n       ```\n   - Installer les fonctionnalités shell (shell de Web Scraping et la commande `extract`) :\n       ```bash\n       pip install \"scrapling[shell]\"\n       ```\n   - Tout installer :\n       ```bash\n       pip install \"scrapling[all]\"\n       ```\n   N'oubliez pas que vous devez installer les dépendances navigateur avec `scrapling install` après l'un de ces extras (si vous ne l'avez pas déjà fait)\n\n### Docker\nVous pouvez également installer une image Docker avec tous les extras et navigateurs avec la commande suivante depuis DockerHub :\n```bash\ndocker pull pyd4vinci/scrapling\n```\nOu téléchargez-la depuis le registre GitHub :\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\nCette image est automatiquement construite et publiée en utilisant GitHub Actions et la branche principale du dépôt.\n\n## Contribuer\n\nLes contributions sont les bienvenues ! Veuillez lire nos [directives de contribution](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) avant de commencer.\n\n## Avertissement\n\n> [!CAUTION]\n> Cette bibliothèque est fournie uniquement à des fins éducatives et de recherche. En utilisant cette bibliothèque, vous acceptez de vous conformer aux lois locales et internationales sur le scraping de données et la confidentialité. Les auteurs et contributeurs ne sont pas responsables de toute utilisation abusive de ce logiciel. Respectez toujours les conditions d'utilisation des sites web et les fichiers robots.txt.\n\n## 🎓 Citations\nSi vous avez utilisé notre bibliothèque à des fins de recherche, veuillez nous citer avec la référence suivante :\n```text\n  @misc{scrapling,\n    author = {Karim Shoair},\n    title = {Scrapling},\n    year = {2024},\n    url = {https://github.com/D4Vinci/Scrapling},\n    note = {An adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl!}\n  }\n```\n\n## Licence\n\nCe travail est sous licence BSD-3-Clause.\n\n## Remerciements\n\nCe projet inclut du code adapté de :\n- Parsel (Licence BSD) — Utilisé pour le sous-module [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)\n\n---\n<div align=\"center\"><small>Conçu et développé avec ❤️ par Karim Shoair.</small></div><br>\n"
  },
  {
    "path": "docs/README_JP.md",
    "content": "<!-- mcp-name: io.github.D4Vinci/Scrapling -->\n\n<h1 align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io\">\n        <picture>\n          <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true\">\n          <img alt=\"Scrapling Poster\" src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true\">\n        </picture>\n    </a>\n    <br>\n    <small>Effortless Web Scraping for the Modern Web</small>\n</h1>\n\n<p align=\"center\">\n    <a href=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml\" alt=\"Tests\">\n        <img alt=\"Tests\" src=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg\"></a>\n    <a href=\"https://badge.fury.io/py/Scrapling\" alt=\"PyPI version\">\n        <img alt=\"PyPI version\" src=\"https://badge.fury.io/py/Scrapling.svg\"></a>\n    <a href=\"https://clickpy.clickhouse.com/dashboard/scrapling\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/dm/scrapling\" alt=\"PyPI package downloads\"></a>\n    <a href=\"https://github.com/D4Vinci/Scrapling/tree/main/agent-skill\" alt=\"AI Agent Skill directory\">\n        <img alt=\"Static Badge\" src=\"https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill\"></a>\n    <a href=\"https://clawhub.ai/D4Vinci/scrapling-official\" alt=\"OpenClaw Skill\">\n        <img alt=\"OpenClaw Skill\" src=\"https://img.shields.io/badge/Clawhub-darkred?style=flat&label=OpenClaw&link=https%3A%2F%2Fclawhub.ai%2FD4Vinci%2Fscrapling-official\"></a>\n    <br/>\n    <a href=\"https://discord.gg/EMgGbDceNQ\" alt=\"Discord\" target=\"_blank\">\n      <img alt=\"Discord\" src=\"https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ\">\n    </a>\n    <a href=\"https://x.com/Scrapling_dev\" alt=\"X (formerly Twitter)\">\n      <img alt=\"X (formerly Twitter) Follow\" src=\"https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev\">\n    </a>\n    <br/>\n    <a href=\"https://pypi.org/project/scrapling/\" alt=\"Supported Python versions\">\n        <img alt=\"Supported Python versions\" src=\"https://img.shields.io/pypi/pyversions/scrapling.svg\"></a>\n</p>\n\n<p align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io/en/latest/parsing/selection.html\"><strong>選択メソッド</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/fetching/choosing.html\"><strong>Fetcher の選び方</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/architecture.html\"><strong>スパイダー</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html\"><strong>プロキシローテーション</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/cli/overview.html\"><strong>CLI</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html\"><strong>MCP モード</strong></a>\n</p>\n\nScrapling は、単一のリクエストから本格的なクロールまですべてを処理する適応型 Web Scraping フレームワークです。\n\nそのパーサーはウェブサイトの変更から学習し、ページが更新されたときに要素を自動的に再配置します。Fetcher はすぐに使える Cloudflare Turnstile などのアンチボットシステムを回避します。そして Spider フレームワークにより、Pause & Resume や自動 Proxy 回転機能を備えた並行マルチ Session クロールへとスケールアップできます — すべてわずか数行の Python で。1 つのライブラリ、妥協なし。\n\nリアルタイム統計と 
Streaming による超高速クロール。Web Scraper によって、Web Scraper と一般ユーザーのために構築され、誰にでも何かがあります。\n\n```python\nfrom scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\nStealthyFetcher.adaptive = True\np = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)  # レーダーの下でウェブサイトを取得！\nproducts = p.css('.product', auto_save=True)                                        # ウェブサイトのデザイン変更に耐えるデータをスクレイプ！\nproducts = p.css('.product', adaptive=True)                                         # 後でウェブサイトの構造が変わったら、`adaptive=True`を渡して見つける！\n```\nまたは本格的なクロールへスケールアップ\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass MySpider(Spider):\n  name = \"demo\"\n  start_urls = [\"https://example.com/\"]\n\n  async def parse(self, response: Response):\n      for item in response.css('.product'):\n          yield {\"title\": item.css('h2::text').get()}\n\nMySpider().start()\n```\n\n<p align=\"center\">\n    <a href=\"https://dataimpulse.com/?utm_source=scrapling&utm_medium=banner&utm_campaign=scrapling\" target=\"_blank\" style=\"display:flex; justify-content:center; padding:4px 0;\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/DataImpulse.png\" alt=\"At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.\" style=\"max-height:60px;\">\n    </a>\n</p>\n\n# プラチナスポンサー\n<table>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling\" target=\"_blank\" title=\"Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png\">\n      </a>\n    </td>\n    <td> Scrapling は Cloudflare Turnstile に対応。エンタープライズレベルの保護には、<a href=\"https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling\">\n        <b>Hyper Solutions</b>\n      </a>が<b>Akamai</b>、<b>DataDome</b>、<b>Kasada</b>、<b>Incapsula</b>向けの有効な antibot トークンを生成する API エンドポイントを提供。シンプルな API 呼び出しで、ブラウザ自動化不要。 </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://birdproxies.com/t/scrapling\" target=\"_blank\" title=\"At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg\">\n      </a>\n    </td>\n    <td>プロキシは複雑で高価であるべきではないと考え、<a href=\"https://birdproxies.com/t/scrapling\">\n        <b>BirdProxies</b>\n      </a>を構築しました。 <br /> 195以上のロケーションの高速レジデンシャル・ISPプロキシ、公正な価格設定、そして本物のサポート。 <br />\n      <b>ランディングページでFlappyBird ゲームを試して無料データをゲット！</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\" target=\"_blank\" title=\"Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\">\n        <b>Evomi</b>\n      </a>：レジデンシャルプロキシが $0.49/GB から。完全に偽装された Chromium によるスクレイピングブラウザ、レジデンシャル IP、自動 CAPTCHA 解決、アンチボットバイパス。</br>\n      <b>Scraper API で手間なく結果を取得。MCP と N8N の統合に対応。</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a 
href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\" title=\"Unlock the Power of Social Media Data & AI\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\">TikHub.io</a> は TikTok、X、YouTube、Instagram を含む 16 以上のプラットフォームで 900 以上の安定した API を提供し、4,000 万以上のデータセットを保有。<br /> さらに <a href=\"https://ai.tikhub.io/?ref=KarimShoair\" target=\"_blank\">割引 AI モデル</a>も提供 — Claude、GPT、GEMINI など最大 71% オフ。\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\" title=\"Scalable Web Data Access for AI Applications\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\">Nsocks</a> は開発者やスクレイパー向けの高速なレジデンシャルおよび ISP プロキシを提供。グローバル IP カバレッジ、高い匿名性、スマートなローテーション、自動化とデータ抽出のための信頼性の高いパフォーマンス。<a href=\"https://www.xcrawl.com/?keyword=2p67aivg\" target=\"_blank\">Xcrawl</a> で大規模ウェブクローリングを簡素化。\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\" title=\"PetroSky delivers cutting-edge VPS hosting.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png\">\n      </a>\n    </td>\n    <td>\n    ノートパソコンを閉じても、スクレイパーは動き続けます。<br />\n    <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\">PetroSky VPS</a> - ノンストップ自動化のために構築されたクラウドサーバー。Windows と Linux マシンを完全制御。月額 €6.99 から。\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\" title=\"The #1 newsletter dedicated to Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\">The Web Scraping Club で Scrapling の詳細レビュー</a>（2025年11月）をお読みください。Web スクレイピング専門の No.1 ニュースレターです。\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\" title=\"Proxy-Seller provides reliable proxy infrastructure for Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\">Proxy-Seller</a> は Web スクレイピング向けの信頼性の高いプロキシインフラを提供しています。IPv4、IPv6、ISP、レジデンシャル、モバイルプロキシに対応し、安定したパフォーマンス、幅広い地理的カバレッジ、企業規模のデータ収集に柔軟なプランを備えています。\n    </td>\n  </tr>\n</table>\n\n<i><sub>ここに広告を表示したいですか？[こちら](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)をクリック</sub></i>\n# スポンサー\n\n<!-- sponsors -->\n\n<a href=\"https://serpapi.com/?utm_source=scrapling\" target=\"_blank\" title=\"Scrape Google and other search engines with SerpApi\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png\"></a>\n<a href=\"https://visit.decodo.com/Dy6W0b\" target=\"_blank\" title=\"Try the Most Efficient Residential Proxies for Free\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png\"></a>\n<a 
href=\"https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci\" target=\"_blank\" title=\"The web scraping service that actually beats anti-bot systems!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png\"></a>\n<a href=\"https://proxyempire.io/?ref=scrapling&utm_source=scrapling\" target=\"_blank\" title=\"Collect The Data Your Project Needs with the Best Residential Proxies\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png\"></a><a href=\"https://www.swiftproxy.net/\" target=\"_blank\" title=\"Unlock Reliable Proxy Services with Swiftproxy!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png\"></a>\n<a href=\"https://www.rapidproxy.io/?ref=d4v\" target=\"_blank\" title=\"Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs.\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg\"></a>\n<a href=\"https://browser.cash/?utm_source=D4Vinci&utm_medium=referral\" target=\"_blank\" title=\"Browser Automation & AI Browser Agent Platform\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png\"></a>\n\n<!-- /sponsors -->\n\n<i><sub>ここに広告を表示したいですか？[こちら](https://github.com/sponsors/D4Vinci)をクリックして、あなたに合ったティアを選択してください！</sub></i>\n\n---\n\n## 主な機能\n\n### Spider — 本格的なクロールフレームワーク\n- 🕷️ **Scrapy 風の Spider API**：`start_urls`、async `parse` callback、`Request`/`Response` オブジェクトで Spider を定義。\n- ⚡ **並行クロール**：設定可能な並行数制限、ドメインごとのスロットリング、ダウンロード遅延。\n- 🔄 **マルチ Session サポート**：HTTP リクエストとステルスヘッドレスブラウザの統一インターフェース — ID によって異なる Session にリクエストをルーティング。\n- 💾 **Pause & Resume**：Checkpoint ベースのクロール永続化。Ctrl+C で正常にシャットダウン；再起動すると中断したところから再開。\n- 📡 **Streaming モード**：`async for item in spider.stream()` でリアルタイム統計とともにスクレイプされたアイテムを Streaming で受信 — UI、パイプライン、長時間実行クロールに最適。\n- 🛡️ **ブロックされたリクエストの検出**：カスタマイズ可能なロジックによるブロックされたリクエストの自動検出とリトライ。\n- 📦 **組み込みエクスポート**：フックや独自のパイプライン、または組み込みの JSON/JSONL で結果をエクスポート。それぞれ`result.items.to_json()` / `result.items.to_jsonl()`を使用。\n\n### Session サポート付き高度なウェブサイト取得\n- **HTTP リクエスト**：`Fetcher` クラスで高速かつステルスな HTTP リクエスト。ブラウザの TLS fingerprint、ヘッダーを模倣し、HTTP/3 を使用可能。\n- **動的読み込み**：Playwright の Chromium と Google Chrome をサポートする `DynamicFetcher` クラスによる完全なブラウザ自動化で動的ウェブサイトを取得。\n- **アンチボット回避**：`StealthyFetcher` と fingerprint 偽装による高度なステルス機能。自動化で Cloudflare の Turnstile/Interstitial のすべてのタイプを簡単に回避。\n- **Session 管理**：リクエスト間で Cookie と状態を管理するための `FetcherSession`、`StealthySession`、`DynamicSession` クラスによる永続的な Session サポート。\n- **Proxy 回転**：すべての Session タイプに対応したラウンドロビンまたはカスタム戦略の組み込み `ProxyRotator`、さらにリクエストごとの Proxy オーバーライド。\n- **ドメインブロック**：ブラウザベースの Fetcher で特定のドメイン（およびそのサブドメイン）へのリクエストをブロック。\n- **async サポート**：すべての Fetcher および専用 async Session クラス全体での完全な async サポート。\n\n### 適応型スクレイピングと AI 統合\n- 🔄 **スマート要素追跡**：インテリジェントな類似性アルゴリズムを使用してウェブサイトの変更後に要素を再配置。\n- 🎯 **スマート柔軟選択**：CSS セレクタ、XPath セレクタ、フィルタベース検索、テキスト検索、正規表現検索など。\n- 🔍 **類似要素の検出**：見つかった要素に類似した要素を自動的に特定。\n- 🤖 **AI と使用する MCP サーバー**：AI 支援 Web Scraping とデータ抽出のための組み込み MCP サーバー。MCP サーバーは、AI（Claude/Cursor など）に渡す前に Scrapling を活用してターゲットコンテンツを抽出する強力でカスタムな機能を備えており、操作を高速化し、トークン使用量を最小限に抑えることでコストを削減します。（[デモ動画](https://www.youtube.com/watch?v=qyFk3ZNwOxE)）\n\n### 高性能で実戦テスト済みのアーキテクチャ\n- 🚀 **超高速**：ほとんどの Python スクレイピングライブラリを上回る最適化されたパフォーマンス。\n- 🔋 **メモリ効率**：最小のメモリフットプリントのための最適化されたデータ構造と遅延読み込み。\n- ⚡ **高速 JSON シリアル化**：標準ライブラリの 10 倍の速度。\n- 🏗️ **実戦テスト済み**：Scrapling は 92% のテストカバレッジと完全な型ヒントカバレッジを備えているだけでなく、過去1年間に数百人の Web Scraper によって毎日使用されてきました。\n\n### 
開発者/Web Scraper にやさしい体験\n- 🎯 **インタラクティブ Web Scraping Shell**：Scrapling 統合、ショートカット、curl リクエストを Scrapling リクエストに変換したり、ブラウザでリクエスト結果を表示したりするなどの新しいツールを備えたオプションの組み込み IPython Shell で、Web Scraping スクリプトの開発を加速。\n- 🚀 **ターミナルから直接使用**：オプションで、コードを一行も書かずに Scrapling を使用して URL をスクレイプできます！\n- 🛠️ **豊富なナビゲーション API**：親、兄弟、子のナビゲーションメソッドによる高度な DOM トラバーサル。\n- 🧬 **強化されたテキスト処理**：組み込みの正規表現、クリーニングメソッド、最適化された文字列操作。\n- 📝 **自動セレクタ生成**：任意の要素に対して堅牢な CSS/XPath セレクタを生成。\n- 🔌 **馴染みのある API**：Scrapy/Parsel で使用されている同じ疑似要素を持つ Scrapy/BeautifulSoup に似た設計。\n- 📘 **完全な型カバレッジ**：優れた IDE サポートとコード補完のための完全な型ヒント。コードベース全体が変更のたびに**PyRight**と**MyPy**で自動的にスキャンされます。\n- 🔋 **すぐに使える Docker イメージ**：各リリースで、すべてのブラウザを含む Docker イメージが自動的にビルドおよびプッシュされます。\n\n## はじめに\n\n深く掘り下げずに、Scrapling にできることの簡単な概要をお見せしましょう。\n\n### 基本的な使い方\nSession サポート付き HTTP リクエスト\n```python\nfrom scrapling.fetchers import Fetcher, FetcherSession\n\nwith FetcherSession(impersonate='chrome') as session:  # Chrome の TLS fingerprint の最新バージョンを使用\n    page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)\n    quotes = page.css('.quote .text::text').getall()\n\n# または一回限りのリクエストを使用\npage = Fetcher.get('https://quotes.toscrape.com/')\nquotes = page.css('.quote .text::text').getall()\n```\n高度なステルスモード\n```python\nfrom scrapling.fetchers import StealthyFetcher, StealthySession\n\nwith StealthySession(headless=True, solve_cloudflare=True) as session:  # 完了するまでブラウザを開いたままにする\n    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)\n    data = page.css('#padded_content a').getall()\n\n# または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')\ndata = page.css('#padded_content a').getall()\n```\n完全なブラウザ自動化\n```python\nfrom scrapling.fetchers import DynamicFetcher, DynamicSession\n\nwith DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:  # 完了するまでブラウザを開いたままにする\n    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)\n    data = page.xpath('//span[@class=\"text\"]/text()').getall()  # お好みであれば XPath セレクタを使用\n\n# または一回限りのリクエストスタイル、このリクエストのためにブラウザを開き、完了後に閉じる\npage = DynamicFetcher.fetch('https://quotes.toscrape.com/')\ndata = page.css('.quote .text::text').getall()\n```\n\n### Spider\n並行リクエスト、複数の Session タイプ、Pause & Resume を備えた本格的なクローラーを構築：\n```python\nfrom scrapling.spiders import Spider, Request, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 10\n\n    async def parse(self, response: Response):\n        for quote in response.css('.quote'):\n            yield {\n                \"text\": quote.css('.text::text').get(),\n                \"author\": quote.css('.author::text').get(),\n            }\n\n        next_page = response.css('.next a')\n        if next_page:\n            yield response.follow(next_page[0].attrib['href'])\n\nresult = QuotesSpider().start()\nprint(f\"{len(result.items)}件の引用をスクレイプしました\")\nresult.items.to_json(\"quotes.json\")\n```\n単一の Spider で複数の Session タイプを使用：\n```python\nfrom scrapling.spiders import Spider, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass MultiSessionSpider(Spider):\n    name = \"multi\"\n    start_urls = [\"https://example.com/\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"fast\", FetcherSession(impersonate=\"chrome\"))\n        manager.add(\"stealth\", AsyncStealthySession(headless=True), lazy=True)\n\n    async def parse(self, response: 
Response):\n        for link in response.css('a::attr(href)').getall():\n            # 保護されたページはステルス Session を通してルーティング\n            if \"protected\" in link:\n                yield Request(link, sid=\"stealth\")\n            else:\n                yield Request(link, sid=\"fast\", callback=self.parse)  # 明示的な callback\n```\nCheckpoint を使用して長時間のクロールをPause & Resume：\n```python\nQuotesSpider(crawldir=\"./crawl_data\").start()\n```\nCtrl+C を押すと正常に一時停止し、進捗は自動的に保存されます。後で Spider を再度起動する際に同じ`crawldir`を渡すと、中断したところから再開します。\n\n### 高度なパースとナビゲーション\n```python\nfrom scrapling.fetchers import Fetcher\n\n# 豊富な要素選択とナビゲーション\npage = Fetcher.get('https://quotes.toscrape.com/')\n\n# 複数の選択メソッドで引用を取得\nquotes = page.css('.quote')  # CSS セレクタ\nquotes = page.xpath('//div[@class=\"quote\"]')  # XPath\nquotes = page.find_all('div', {'class': 'quote'})  # BeautifulSoup スタイル\n# 以下と同じ\nquotes = page.find_all('div', class_='quote')\nquotes = page.find_all(['div'], class_='quote')\nquotes = page.find_all(class_='quote')  # など...\n# テキスト内容で要素を検索\nquotes = page.find_by_text('quote', tag='div')\n\n# 高度なナビゲーション\nquote_text = page.css('.quote')[0].css('.text::text').get()\nquote_text = page.css('.quote').css('.text::text').getall()  # チェーンセレクタ\nfirst_quote = page.css('.quote')[0]\nauthor = first_quote.next_sibling.css('.author::text')\nparent_container = first_quote.parent\n\n# 要素の関連性と類似性\nsimilar_elements = first_quote.find_similar()\nbelow_elements = first_quote.below_elements()\n```\nウェブサイトを取得せずにパーサーをすぐに使用することもできます：\n```python\nfrom scrapling.parser import Selector\n\npage = Selector(\"<html>...</html>\")\n```\nまったく同じ方法で動作します！\n\n### 非同期 Session 管理の例\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession\n\nasync with FetcherSession(http3=True) as session:  # `FetcherSession` はコンテキストアウェアで、同期/非同期両方のパターンで動作可能\n    page1 = session.get('https://quotes.toscrape.com/')\n    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')\n\n# 非同期 Session の使用\nasync with AsyncStealthySession(max_pages=2) as session:\n    tasks = []\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n\n    for url in urls:\n        task = session.fetch(url)\n        tasks.append(task)\n\n    print(session.get_pool_stats())  # オプション - ブラウザタブプールのステータス（ビジー/フリー/エラー）\n    results = await asyncio.gather(*tasks)\n    print(session.get_pool_stats())\n```\n\n## CLI とインタラクティブ Shell\n\nScrapling には強力なコマンドラインインターフェースが含まれています：\n\n[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)\n\nインタラクティブ Web Scraping Shell を起動\n```bash\nscrapling shell\n```\nプログラミングせずに直接ページをファイルに抽出（デフォルトで`body`タグ内のコンテンツを抽出）。出力ファイルが`.txt`で終わる場合、ターゲットのテキストコンテンツが抽出されます。`.md`で終わる場合、HTML コンテンツの Markdown 表現になります。`.html` で終わる場合、HTML コンテンツそのものになります。\n```bash\nscrapling extract get 'https://example.com' content.md\nscrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome'  # CSS セレクタ'#fromSkipToProducts'に一致するすべての要素\nscrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless\nscrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare\n```\n\n> [!NOTE]\n> MCP サーバーやインタラクティブ Web Scraping Shell など、他にも多くの追加機能がありますが、このページは簡潔に保ちたいと思います。完全なドキュメントは[こちら](https://scrapling.readthedocs.io/en/latest/)をご覧ください\n\n## パフォーマンスベンチマーク\n\nScrapling は強力であるだけでなく、超高速です。以下のベンチマークは、Scrapling 
のパーサーを他の人気ライブラリの最新バージョンと比較しています。\n\n### テキスト抽出速度テスト（5000 個のネストされた要素）\n\n| # |      ライブラリ      | 時間 (ms) | vs Scrapling |\n|---|:-----------------:|:---------:|:------------:|\n| 1 |     Scrapling     |   2.02    |     1.0x     |\n| 2 |   Parsel/Scrapy   |   2.04    |     1.01     |\n| 3 |     Raw Lxml      |   2.54    |    1.257     |\n| 4 |      PyQuery      |   24.17   |     ~12x     |\n| 5 |    Selectolax     |   82.63   |     ~41x     |\n| 6 |  MechanicalSoup   |  1549.71  |   ~767.1x    |\n| 7 |   BS4 with Lxml   |  1584.31  |   ~784.3x    |\n| 8 | BS4 with html5lib |  3391.91  |   ~1679.1x   |\n\n\n### 要素類似性とテキスト検索のパフォーマンス\n\nScrapling の適応型要素検索機能は代替手段を大幅に上回ります：\n\n| ライブラリ     | 時間 (ms) | vs Scrapling |\n|-------------|:---------:|:------------:|\n| Scrapling   |   2.39    |     1.0x     |\n| AutoScraper |   12.45   |    5.209x    |\n\n\n> すべてのベンチマークは 100 回以上の実行の平均を表します。方法論については[benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)を参照してください。\n\n## インストール\n\nScrapling には Python 3.10 以上が必要です：\n\n```bash\npip install scrapling\n```\n\nこのインストールにはパーサーエンジンとその依存関係のみが含まれており、Fetcher やコマンドライン依存関係は含まれていません。\n\n### オプションの依存関係\n\n1. 以下の追加機能、Fetcher、またはそれらのクラスのいずれかを使用する場合は、Fetcher の依存関係とブラウザの依存関係を次のようにインストールする必要があります：\n    ```bash\n    pip install \"scrapling[fetchers]\"\n\n    scrapling install           # normal install\n    scrapling install  --force  # force reinstall\n    ```\n\n    これにより、すべてのブラウザ、およびそれらのシステム依存関係とfingerprint 操作依存関係がダウンロードされます。\n\n    または、コマンドを実行する代わりにコードからインストールすることもできます：\n    ```python\n    from scrapling.cli import install\n\n    install([], standalone_mode=False)          # normal install\n    install([\"--force\"], standalone_mode=False) # force reinstall\n    ```\n\n2. 追加機能：\n   - MCP サーバー機能をインストール：\n       ```bash\n       pip install \"scrapling[ai]\"\n       ```\n   - Shell 機能（Web Scraping Shell と`extract`コマンド）をインストール：\n       ```bash\n       pip install \"scrapling[shell]\"\n       ```\n   - すべてをインストール：\n       ```bash\n       pip install \"scrapling[all]\"\n       ```\n   これらの追加機能のいずれかの後（まだインストールしていない場合）、`scrapling install`でブラウザの依存関係をインストールする必要があることを忘れないでください\n\n### Docker\nDockerHub から次のコマンドですべての追加機能とブラウザを含む Docker イメージをインストールすることもできます：\n```bash\ndocker pull pyd4vinci/scrapling\n```\nまたは GitHub レジストリからダウンロード：\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\nこのイメージは、GitHub Actions とリポジトリのメインブランチを使用して自動的にビルドおよびプッシュされます。\n\n## 貢献\n\n貢献を歓迎します！始める前に[貢献ガイドライン](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md)をお読みください。\n\n## 免責事項\n\n> [!CAUTION]\n> このライブラリは教育および研究目的のみで提供されています。このライブラリを使用することにより、地域および国際的なデータスクレイピングおよびプライバシー法に準拠することに同意したものとみなされます。著者および貢献者は、このソフトウェアの誤用について責任を負いません。常にウェブサイトの利用規約とrobots.txt ファイルを尊重してください。\n\n## 🎓 引用\n研究目的で当ライブラリを使用された場合は、以下の参考文献で引用してください：\n```text\n  @misc{scrapling,\n    author = {Karim Shoair},\n    title = {Scrapling},\n    year = {2024},\n    url = {https://github.com/D4Vinci/Scrapling},\n    note = {An adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl!}\n  }\n```\n\n## ライセンス\n\nこの作品は BSD-3-Clause ライセンスの下でライセンスされています。\n\n## 謝辞\n\nこのプロジェクトには次から適応されたコードが含まれています：\n- Parsel（BSD ライセンス）— [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) サブモジュールに使用\n\n---\n<div align=\"center\"><small>Karim Shoair によって❤️でデザインおよび作成されました。</small></div><br>\n"
  },
  {
    "path": "docs/README_KR.md",
    "content": "<!-- mcp-name: io.github.D4Vinci/Scrapling -->\n\n<h1 align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io\">\n        <picture>\n          <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true\">\n          <img alt=\"Scrapling Poster\" src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true\">\n        </picture>\n    </a>\n    <br>\n    <small>Effortless Web Scraping for the Modern Web</small>\n</h1>\n\n<p align=\"center\">\n    <a href=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml\" alt=\"Tests\">\n        <img alt=\"Tests\" src=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg\"></a>\n    <a href=\"https://badge.fury.io/py/Scrapling\" alt=\"PyPI version\">\n        <img alt=\"PyPI version\" src=\"https://badge.fury.io/py/Scrapling.svg\"></a>\n    <a href=\"https://clickpy.clickhouse.com/dashboard/scrapling\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/dm/scrapling\" alt=\"PyPI package downloads\"></a>\n    <a href=\"https://github.com/D4Vinci/Scrapling/tree/main/agent-skill\" alt=\"AI Agent Skill directory\">\n        <img alt=\"Static Badge\" src=\"https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill\"></a>\n    <a href=\"https://clawhub.ai/D4Vinci/scrapling-official\" alt=\"OpenClaw Skill\">\n        <img alt=\"OpenClaw Skill\" src=\"https://img.shields.io/badge/Clawhub-darkred?style=flat&label=OpenClaw&link=https%3A%2F%2Fclawhub.ai%2FD4Vinci%2Fscrapling-official\"></a>\n    <br/>\n    <a href=\"https://discord.gg/EMgGbDceNQ\" alt=\"Discord\" target=\"_blank\">\n      <img alt=\"Discord\" src=\"https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ\">\n    </a>\n    <a href=\"https://x.com/Scrapling_dev\" alt=\"X (formerly Twitter)\">\n      <img alt=\"X (formerly Twitter) Follow\" src=\"https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev\">\n    </a>\n    <br/>\n    <a href=\"https://pypi.org/project/scrapling/\" alt=\"Supported Python versions\">\n        <img alt=\"Supported Python versions\" src=\"https://img.shields.io/pypi/pyversions/scrapling.svg\"></a>\n</p>\n\n<p align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io/en/latest/parsing/selection.html\"><strong>선택 메서드</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/fetching/choosing.html\"><strong>Fetcher 선택 가이드</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/architecture.html\"><strong>Spider</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html\"><strong>프록시 로테이션</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/cli/overview.html\"><strong>CLI</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html\"><strong>MCP 서버</strong></a>\n</p>\n\nScrapling은 단일 요청부터 대규모 크롤링까지 모든 것을 처리하는 적응형 Web Scraping 프레임워크입니다.\n\n파서는 웹사이트 변경 사항을 학습하고, 페이지가 업데이트되면 요소를 자동으로 재배치합니다. Fetcher는 Cloudflare Turnstile 같은 안티봇 시스템을 별도 설정 없이 우회합니다. Spider 프레임워크를 사용하면 일시정지/재개 및 자동 프록시 로테이션을 갖춘 동시 멀티 세션 크롤링으로 확장할 수 있습니다 — 모두 Python 몇 줄이면 됩니다. 
하나의 라이브러리, 타협 없는 성능.\n\n실시간 통계와 스트리밍을 통한 초고속 크롤링. Web Scraper가 만들고, Web Scraper와 일반 사용자 모두를 위해 설계했습니다.\n\n```python\nfrom scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\nStealthyFetcher.adaptive = True\np = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)  # 탐지를 피해 웹사이트를 가져옵니다!\nproducts = p.css('.product', auto_save=True)                                        # 웹사이트 디자인 변경에도 살아남는 데이터를 스크레이핑!\nproducts = p.css('.product', adaptive=True)                                         # 나중에 웹사이트 구조가 바뀌면, `adaptive=True`를 전달해서 찾으세요!\n```\n또는 본격적인 크롤링으로 확장\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass MySpider(Spider):\n  name = \"demo\"\n  start_urls = [\"https://example.com/\"]\n\n  async def parse(self, response: Response):\n      for item in response.css('.product'):\n          yield {\"title\": item.css('h2::text').get()}\n\nMySpider().start()\n```\n\n<p align=\"center\">\n    <a href=\"https://dataimpulse.com/?utm_source=scrapling&utm_medium=banner&utm_campaign=scrapling\" target=\"_blank\" style=\"display:flex; justify-content:center; padding:4px 0;\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/DataImpulse.png\" alt=\"At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.\" style=\"max-height:60px;\">\n    </a>\n</p>\n\n# 플래티넘 스폰서\n<table>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling\" target=\"_blank\" title=\"Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png\">\n      </a>\n    </td>\n    <td> Scrapling은 Cloudflare Turnstile을 처리합니다. 엔터프라이즈급 보호가 필요하다면, <a href=\"https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling\">\n        <b>Hyper Solutions</b>\n      </a>가 <b>Akamai</b>, <b>DataDome</b>, <b>Kasada</b>, <b>Incapsula</b>용 유효한 안티봇 토큰을 생성하는 API 엔드포인트를 제공합니다. 간단한 API 호출만으로, 브라우저 자동화가 필요 없습니다. </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://birdproxies.com/t/scrapling\" target=\"_blank\" title=\"At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg\">\n      </a>\n    </td>\n    <td>프록시는 복잡하거나 비쌀 이유가 없다는 생각으로 <a href=\"https://birdproxies.com/t/scrapling\">\n        <b>BirdProxies</b>\n      </a>를 만들었습니다. <br /> 195개 이상 지역의 빠른 레지덴셜 및 ISP 프록시, 합리적인 가격, 실질적인 지원. <br />\n      <b>랜딩 페이지에서 FlappyBird 게임을 플레이하고 무료 데이터를 받으세요!</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\" target=\"_blank\" title=\"Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\">\n        <b>Evomi</b>\n      </a>: 레지덴셜 프록시 GB당 $0.49부터. 완전히 위장된 Chromium 스크레이핑 브라우저, 레지덴셜 IP, 자동 CAPTCHA 해결, 안티봇 우회.</br>\n      <b>Scraper API로 번거로움 없이 결과를 얻으세요. 
MCP 및 N8N 통합 지원.</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\" title=\"Unlock the Power of Social Media Data & AI\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\">TikHub.io</a>는 TikTok, X, YouTube, Instagram 등 16개 이상 플랫폼에서 900개 이상의 안정적인 API를 제공하며, 4,000만 이상의 데이터셋을 보유하고 있습니다. <br /> <a href=\"https://ai.tikhub.io/?ref=KarimShoair\" target=\"_blank\">할인된 AI 모델</a>도 제공 — Claude, GPT, GEMINI 등 최대 71% 할인.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\" title=\"Scalable Web Data Access for AI Applications\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\">Nsocks</a>는 개발자와 스크레이퍼를 위한 빠른 레지덴셜 및 ISP 프록시를 제공합니다. 글로벌 IP 커버리지, 높은 익명성, 스마트 로테이션, 자동화와 데이터 추출을 위한 안정적인 성능. <a href=\"https://www.xcrawl.com/?keyword=2p67aivg\" target=\"_blank\">Xcrawl</a>로 대규모 웹 크롤링을 간소화하세요.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\" title=\"PetroSky delivers cutting-edge VPS hosting.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png\">\n      </a>\n    </td>\n    <td>\n    노트북을 닫으세요. 스크래퍼는 계속 작동합니다. <br />\n    <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\">PetroSky VPS</a> - 논스톱 자동화를 위한 클라우드 서버. Windows 및 Linux 머신을 완벽하게 제어. 월 €6.99부터.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\" title=\"The #1 newsletter dedicated to Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\">The Web Scraping Club에서 Scrapling의 전체 리뷰</a>(2025년 11월)를 읽어보세요. 웹 스크래핑 전문 No.1 뉴스레터입니다.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\" title=\"Proxy-Seller provides reliable proxy infrastructure for Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\">Proxy-Seller</a>는 웹 스크래핑을 위한 안정적인 프록시 인프라를 제공합니다. IPv4, IPv6, ISP, 주거용 및 모바일 프록시를 지원하며, 안정적인 성능, 광범위한 지역 커버리지, 기업 규모의 데이터 수집을 위한 유연한 요금제를 갖추고 있습니다.\n    </td>\n  </tr>\n</table>\n\n<i><sub>여기에 광고를 게재하고 싶으신가요? 
[여기](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)를 클릭하세요</sub></i>\n# 스폰서\n\n<!-- sponsors -->\n\n<a href=\"https://serpapi.com/?utm_source=scrapling\" target=\"_blank\" title=\"Scrape Google and other search engines with SerpApi\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png\"></a>\n<a href=\"https://visit.decodo.com/Dy6W0b\" target=\"_blank\" title=\"Try the Most Efficient Residential Proxies for Free\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png\"></a>\n<a href=\"https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci\" target=\"_blank\" title=\"The web scraping service that actually beats anti-bot systems!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png\"></a>\n<a href=\"https://proxyempire.io/?ref=scrapling&utm_source=scrapling\" target=\"_blank\" title=\"Collect The Data Your Project Needs with the Best Residential Proxies\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png\"></a><a href=\"https://www.swiftproxy.net/\" target=\"_blank\" title=\"Unlock Reliable Proxy Services with Swiftproxy!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png\"></a>\n<a href=\"https://www.rapidproxy.io/?ref=d4v\" target=\"_blank\" title=\"Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs.\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg\"></a>\n<a href=\"https://browser.cash/?utm_source=D4Vinci&utm_medium=referral\" target=\"_blank\" title=\"Browser Automation & AI Browser Agent Platform\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png\"></a>\n\n<!-- /sponsors -->\n\n<i><sub>여기에 광고를 게재하고 싶으신가요? [여기](https://github.com/sponsors/D4Vinci)를 클릭하고 원하는 티어를 선택하세요!</sub></i>\n\n---\n\n## 주요 기능\n\n### Spider — 본격적인 크롤링 프레임워크\n- 🕷️ **Scrapy 스타일 Spider API**: `start_urls`, 비동기 `parse` 콜백, `Request`/`Response` 객체로 Spider를 정의합니다.\n- ⚡ **동시 크롤링**: 설정 가능한 동시 요청 수 제한, 도메인별 스로틀링, 다운로드 딜레이를 지원합니다.\n- 🔄 **멀티 세션 지원**: HTTP 요청과 스텔스 헤드리스 브라우저를 하나의 인터페이스로 통합 — ID로 요청을 다른 세션에 라우팅합니다.\n- 💾 **일시정지 & 재개**: 체크포인트 기반의 크롤링 영속화. Ctrl+C로 정상 종료하고, 재시작하면 중단된 지점부터 이어갑니다.\n- 📡 **스트리밍 모드**: `async for item in spider.stream()`으로 스크레이핑된 아이템을 실시간 통계와 함께 스트리밍으로 수신 — UI, 파이프라인, 장시간 크롤링에 적합합니다.\n- 🛡️ **차단된 요청 감지**: 커스텀 로직을 통한 차단된 요청의 자동 감지 및 재시도를 지원합니다.\n- 📦 **내장 내보내기**: 훅이나 자체 파이프라인, 또는 내장 JSON/JSONL로 결과를 내보냅니다. 각각 `result.items.to_json()` / `result.items.to_jsonl()`을 사용합니다.\n\n### 세션을 지원하는 고급 웹사이트 가져오기\n- **HTTP 요청**: `Fetcher` 클래스로 빠르고 은밀한 HTTP 요청. 브라우저의 TLS fingerprint, 헤더를 모방하고, HTTP/3를 사용할 수 있습니다.\n- **동적 로딩**: Playwright의 Chromium과 Google Chrome을 지원하는 `DynamicFetcher` 클래스로 완전한 브라우저 자동화를 통해 동적 웹사이트를 가져옵니다.\n- **안티봇 우회**: `StealthyFetcher`와 fingerprint 위장을 통한 고급 스텔스 기능. 
자동화로 모든 유형의 Cloudflare Turnstile/Interstitial을 손쉽게 우회합니다.\n- **세션 관리**: `FetcherSession`, `StealthySession`, `DynamicSession` 클래스로 요청 간 쿠키와 상태를 관리하는 영속적 세션을 지원합니다.\n- **프록시 로테이션**: 모든 세션 타입에 대응하는 순환 또는 커스텀 전략의 내장 `ProxyRotator`와 요청별 프록시 오버라이드를 제공합니다.\n- **도메인 차단**: 브라우저 기반 Fetcher에서 특정 도메인(및 하위 도메인)으로의 요청을 차단합니다.\n- **비동기 지원**: 모든 Fetcher와 전용 비동기 세션 클래스에서 완전한 비동기를 지원합니다.\n\n### 적응형 스크레이핑 & AI 통합\n- 🔄 **스마트 요소 추적**: 지능적인 유사도 알고리즘으로 웹사이트 변경 후에도 요소를 재배치합니다.\n- 🎯 **유연한 스마트 선택**: CSS selector, XPath selector, 필터 기반 검색, 텍스트 검색, 정규식 검색 등을 지원합니다.\n- 🔍 **유사 요소 찾기**: 발견된 요소와 유사한 요소를 자동으로 찾아냅니다.\n- 🤖 **AI와 함께 사용하는 MCP 서버**: AI 기반 Web Scraping과 데이터 추출을 위한 내장 MCP 서버. AI(Claude/Cursor 등)에 전달하기 전에 Scrapling을 활용해 대상 콘텐츠를 추출하는 강력한 커스텀 기능을 갖추고 있어, 작업 속도를 높이고 토큰 사용량을 최소화해 비용을 절감합니다. ([데모 영상](https://www.youtube.com/watch?v=qyFk3ZNwOxE))\n\n### 고성능 & 실전 검증된 아키텍처\n- 🚀 **초고속**: 대부분의 Python 스크레이핑 라이브러리를 능가하는 최적화된 성능.\n- 🔋 **메모리 효율**: 최적화된 데이터 구조와 지연 로딩으로 메모리 사용을 최소화합니다.\n- ⚡ **고속 JSON 직렬화**: 표준 라이브러리보다 10배 빠릅니다.\n- 🏗️ **실전 검증**: Scrapling은 92%의 테스트 커버리지와 완전한 타입 힌트 커버리지를 갖추고 있을 뿐 아니라, 지난 1년간 수백 명의 Web Scraper가 매일 사용해 왔습니다.\n\n### 개발자/Web Scraper 친화적 경험\n- 🎯 **인터랙티브 Web Scraping Shell**: Scrapling 통합, 단축키, curl 요청을 Scrapling 요청으로 변환하거나 브라우저에서 요청 결과를 확인하는 등의 도구를 갖춘 선택적 내장 IPython Shell로, Web Scraping 스크립트 개발을 가속합니다.\n- 🚀 **터미널에서 바로 사용**: 코드 한 줄 없이 Scrapling으로 URL을 스크레이핑할 수 있습니다!\n- 🛠️ **풍부한 내비게이션 API**: 부모, 형제, 자식 탐색 메서드를 통한 고급 DOM 순회를 지원합니다.\n- 🧬 **향상된 텍스트 처리**: 내장 정규식, 클리닝 메서드, 최적화된 문자열 연산을 제공합니다.\n- 📝 **자동 셀렉터 생성**: 모든 요소에 대해 견고한 CSS/XPath selector를 생성합니다.\n- 🔌 **익숙한 API**: Scrapy/Parsel에서 사용하는 것과 동일한 의사 요소(pseudo-element)를 가진 Scrapy/BeautifulSoup 스타일의 API.\n- 📘 **완전한 타입 커버리지**: 뛰어난 IDE 지원과 코드 자동완성을 위한 완전한 타입 힌트. 코드베이스 전체가 변경될 때마다 **PyRight**와 **MyPy**로 자동 검사됩니다.\n- 🔋 **바로 사용 가능한 Docker 이미지**: 매 릴리스마다 모든 브라우저를 포함한 Docker 이미지가 자동으로 빌드 및 푸시됩니다.\n\n## 시작하기\n\n깊이 들어가지 않고, Scrapling이 할 수 있는 것들을 간단히 살펴보겠습니다.\n\n### 기본 사용법\n세션을 지원하는 HTTP 요청\n```python\nfrom scrapling.fetchers import Fetcher, FetcherSession\n\nwith FetcherSession(impersonate='chrome') as session:  # Chrome의 최신 TLS fingerprint 사용\n    page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)\n    quotes = page.css('.quote .text::text').getall()\n\n# 또는 일회성 요청 사용\npage = Fetcher.get('https://quotes.toscrape.com/')\nquotes = page.css('.quote .text::text').getall()\n```\n고급 스텔스 모드\n```python\nfrom scrapling.fetchers import StealthyFetcher, StealthySession\n\nwith StealthySession(headless=True, solve_cloudflare=True) as session:  # 작업이 끝날 때까지 브라우저를 열어둡니다\n    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)\n    data = page.css('#padded_content a').getall()\n\n# 또는 일회성 요청 스타일 — 이 요청을 위해 브라우저를 열고, 완료 후 닫습니다\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')\ndata = page.css('#padded_content a').getall()\n```\n완전한 브라우저 자동화\n```python\nfrom scrapling.fetchers import DynamicFetcher, DynamicSession\n\nwith DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:  # 작업이 끝날 때까지 브라우저를 열어둡니다\n    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)\n    data = page.xpath('//span[@class=\"text\"]/text()').getall()  # 원하시면 XPath selector도 사용 가능\n\n# 또는 일회성 요청 스타일 — 이 요청을 위해 브라우저를 열고, 완료 후 닫습니다\npage = DynamicFetcher.fetch('https://quotes.toscrape.com/')\ndata = page.css('.quote .text::text').getall()\n```\n\n### Spider\n동시 요청, 여러 세션 타입, 일시정지 & 재개를 갖춘 본격적인 크롤러 구축:\n```python\nfrom scrapling.spiders import Spider, Request, Response\n\nclass 
QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 10\n\n    async def parse(self, response: Response):\n        for quote in response.css('.quote'):\n            yield {\n                \"text\": quote.css('.text::text').get(),\n                \"author\": quote.css('.author::text').get(),\n            }\n\n        next_page = response.css('.next a')\n        if next_page:\n            yield response.follow(next_page[0].attrib['href'])\n\nresult = QuotesSpider().start()\nprint(f\"{len(result.items)}개의 인용구를 스크레이핑했습니다\")\nresult.items.to_json(\"quotes.json\")\n```\n하나의 Spider에서 여러 세션 타입 사용:\n```python\nfrom scrapling.spiders import Spider, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass MultiSessionSpider(Spider):\n    name = \"multi\"\n    start_urls = [\"https://example.com/\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"fast\", FetcherSession(impersonate=\"chrome\"))\n        manager.add(\"stealth\", AsyncStealthySession(headless=True), lazy=True)\n\n    async def parse(self, response: Response):\n        for link in response.css('a::attr(href)').getall():\n            # 보호된 페이지는 스텔스 세션을 통해 라우팅\n            if \"protected\" in link:\n                yield Request(link, sid=\"stealth\")\n            else:\n                yield Request(link, sid=\"fast\", callback=self.parse)  # 명시적 콜백\n```\n체크포인트를 사용해 장시간 크롤링을 일시정지 & 재개:\n```python\nQuotesSpider(crawldir=\"./crawl_data\").start()\n```\nCtrl+C를 누르면 정상적으로 일시정지되고, 진행 상황이 자동 저장됩니다. 이후 Spider를 다시 시작할 때 동일한 `crawldir`을 전달하면 중단된 지점부터 재개합니다.\n\n### 고급 파싱 & 내비게이션\n```python\nfrom scrapling.fetchers import Fetcher\n\n# 풍부한 요소 선택과 내비게이션\npage = Fetcher.get('https://quotes.toscrape.com/')\n\n# 여러 선택 메서드로 인용구 가져오기\nquotes = page.css('.quote')  # CSS selector\nquotes = page.xpath('//div[@class=\"quote\"]')  # XPath\nquotes = page.find_all('div', {'class': 'quote'})  # BeautifulSoup 스타일\n# 아래와 동일\nquotes = page.find_all('div', class_='quote')\nquotes = page.find_all(['div'], class_='quote')\nquotes = page.find_all(class_='quote')  # 등등...\n# 텍스트 내용으로 요소 찾기\nquotes = page.find_by_text('quote', tag='div')\n\n# 고급 내비게이션\nquote_text = page.css('.quote')[0].css('.text::text').get()\nquote_text = page.css('.quote').css('.text::text').getall()  # 체이닝 셀렉터\nfirst_quote = page.css('.quote')[0]\nauthor = first_quote.next_sibling.css('.author::text')\nparent_container = first_quote.parent\n\n# 요소 관계와 유사도\nsimilar_elements = first_quote.find_similar()\nbelow_elements = first_quote.below_elements()\n```\n웹사이트를 가져오지 않고 파서를 바로 사용할 수도 있습니다:\n```python\nfrom scrapling.parser import Selector\n\npage = Selector(\"<html>...</html>\")\n```\n사용법은 완전히 동일합니다!\n\n### 비동기 세션 관리 예시\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession\n\nasync with FetcherSession(http3=True) as session:  # `FetcherSession`은 컨텍스트 인식이 가능하며 동기/비동기 패턴 모두에서 작동\n    page1 = session.get('https://quotes.toscrape.com/')\n    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')\n\n# 비동기 세션 사용\nasync with AsyncStealthySession(max_pages=2) as session:\n    tasks = []\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n\n    for url in urls:\n        task = session.fetch(url)\n        tasks.append(task)\n\n    print(session.get_pool_stats())  # 선택 사항 - 브라우저 탭 풀 상태 (사용 중/유휴/에러)\n    results = await asyncio.gather(*tasks)\n    
print(session.get_pool_stats())\n```\n\n## CLI & 인터랙티브 Shell\n\nScrapling에는 강력한 커맨드라인 인터페이스가 포함되어 있습니다:\n\n[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)\n\n인터랙티브 Web Scraping Shell 실행\n```bash\nscrapling shell\n```\n프로그래밍 없이 페이지를 파일로 바로 추출합니다 (기본적으로 `body` 태그 내부의 콘텐츠를 추출). 출력 파일이 `.txt`로 끝나면 대상의 텍스트 콘텐츠가 추출됩니다. `.md`로 끝나면 HTML 콘텐츠의 Markdown 표현이 됩니다. `.html`로 끝나면 HTML 콘텐츠 자체가 됩니다.\n```bash\nscrapling extract get 'https://example.com' content.md\nscrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome'  # CSS selector '#fromSkipToProducts'에 매칭되는 모든 요소\nscrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless\nscrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare\n```\n\n> [!NOTE]\n> MCP 서버와 인터랙티브 Web Scraping Shell 등 더 많은 기능이 있지만, 이 페이지는 간결하게 유지하겠습니다. 전체 문서는 [여기](https://scrapling.readthedocs.io/en/latest/)에서 확인하세요.\n\n## 성능 벤치마크\n\nScrapling은 강력할 뿐만 아니라 초고속입니다. 아래 벤치마크는 Scrapling의 파서를 다른 인기 라이브러리의 최신 버전과 비교한 것입니다.\n\n### 텍스트 추출 속도 테스트 (5000개 중첩 요소)\n\n| # |      Library      | Time (ms) | vs Scrapling |\n|---|:-----------------:|:---------:|:------------:|\n| 1 |     Scrapling     |   2.02    |     1.0x     |\n| 2 |   Parsel/Scrapy   |   2.04    |     1.01     |\n| 3 |     Raw Lxml      |   2.54    |    1.257     |\n| 4 |      PyQuery      |   24.17   |     ~12x     |\n| 5 |    Selectolax     |   82.63   |     ~41x     |\n| 6 |  MechanicalSoup   |  1549.71  |   ~767.1x    |\n| 7 |   BS4 with Lxml   |  1584.31  |   ~784.3x    |\n| 8 | BS4 with html5lib |  3391.91  |   ~1679.1x   |\n\n\n### 요소 유사도 & 텍스트 검색 성능\n\nScrapling의 적응형 요소 찾기 기능은 대안들을 크게 앞섭니다:\n\n| Library     | Time (ms) | vs Scrapling |\n|-------------|:---------:|:------------:|\n| Scrapling   |   2.39    |     1.0x     |\n| AutoScraper |   12.45   |    5.209x    |\n\n\n> 모든 벤치마크는 100회 이상 실행의 평균입니다. 측정 방법은 [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py)를 참조하세요.\n\n## 설치\n\nScrapling은 Python 3.10 이상이 필요합니다:\n\n```bash\npip install scrapling\n```\n\n이 설치에는 파서 엔진과 의존성만 포함되며, Fetcher나 커맨드라인 의존성은 포함되지 않습니다.\n\n### 선택적 의존성\n\n1. 아래의 추가 기능, Fetcher, 또는 관련 클래스를 사용하려면 Fetcher 의존성과 브라우저 의존성을 다음과 같이 설치해야 합니다:\n    ```bash\n    pip install \"scrapling[fetchers]\"\n\n    scrapling install           # 일반 설치\n    scrapling install  --force  # 강제 재설치\n    ```\n\n    이렇게 하면 모든 브라우저와 시스템 의존성, fingerprint 조작 의존성이 다운로드됩니다.\n\n    또는 명령어 대신 코드에서 설치할 수도 있습니다:\n    ```python\n    from scrapling.cli import install\n\n    install([], standalone_mode=False)          # 일반 설치\n    install([\"--force\"], standalone_mode=False) # 강제 재설치\n    ```\n\n2. 추가 기능:\n   - MCP 서버 기능 설치:\n       ```bash\n       pip install \"scrapling[ai]\"\n       ```\n   - Shell 기능 (Web Scraping Shell 및 `extract` 명령어) 설치:\n       ```bash\n       pip install \"scrapling[shell]\"\n       ```\n   - 모든 기능 설치:\n       ```bash\n       pip install \"scrapling[all]\"\n       ```\n   위 추가 기능을 설치한 후에도 (아직 하지 않았다면) `scrapling install`로 브라우저 의존성을 설치해야 합니다.\n\n### Docker\nDockerHub에서 모든 추가 기능과 브라우저가 포함된 Docker 이미지를 설치할 수도 있습니다:\n```bash\ndocker pull pyd4vinci/scrapling\n```\n또는 GitHub 레지스트리에서 다운로드:\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\n이 이미지는 GitHub Actions와 레포지토리의 main 브랜치를 사용하여 자동으로 빌드 및 푸시됩니다.\n\n## 기여하기\n\n기여를 환영합니다! 
시작하기 전에 [기여 가이드라인](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md)을 읽어주세요.\n\n## 면책 조항\n\n> [!CAUTION]\n> 이 라이브러리는 교육 및 연구 목적으로만 제공됩니다. 이 라이브러리를 사용함으로써, 국내외 데이터 스크레이핑 및 개인정보 보호 관련 법률을 준수하는 데 동의한 것으로 간주됩니다. 저자와 기여자는 이 소프트웨어의 오용에 대해 책임지지 않습니다. 항상 웹사이트의 이용약관과 robots.txt 파일을 존중하세요.\n\n## 🎓 인용\n연구 목적으로 이 라이브러리를 사용하셨다면, 아래 참고 문헌으로 인용해 주세요:\n```text\n  @misc{scrapling,\n    author = {Karim Shoair},\n    title = {Scrapling},\n    year = {2024},\n    url = {https://github.com/D4Vinci/Scrapling},\n    note = {An adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl!}\n  }\n```\n\n## 라이선스\n\n이 프로젝트는 BSD-3-Clause 라이선스 하에 배포됩니다.\n\n## 감사의 말\n\n이 프로젝트에는 다음에서 차용한 코드가 포함되어 있습니다:\n- Parsel (BSD 라이선스) — [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py) 서브모듈에 사용\n\n---\n<div align=\"center\"><small>Karim Shoair가 ❤️으로 디자인하고 만들었습니다.</small></div><br>\n"
  },
  {
    "path": "docs/README_RU.md",
    "content": "<!-- mcp-name: io.github.D4Vinci/Scrapling -->\n\n<h1 align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io\">\n        <picture>\n          <source media=\"(prefers-color-scheme: dark)\" srcset=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_dark.svg?sanitize=true\">\n          <img alt=\"Scrapling Poster\" src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/cover_light.svg?sanitize=true\">\n        </picture>\n    </a>\n    <br>\n    <small>Effortless Web Scraping for the Modern Web</small>\n</h1>\n\n<p align=\"center\">\n    <a href=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml\" alt=\"Tests\">\n        <img alt=\"Tests\" src=\"https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg\"></a>\n    <a href=\"https://badge.fury.io/py/Scrapling\" alt=\"PyPI version\">\n        <img alt=\"PyPI version\" src=\"https://badge.fury.io/py/Scrapling.svg\"></a>\n    <a href=\"https://clickpy.clickhouse.com/dashboard/scrapling\" rel=\"nofollow\"><img src=\"https://img.shields.io/pypi/dm/scrapling\" alt=\"PyPI package downloads\"></a>\n    <a href=\"https://github.com/D4Vinci/Scrapling/tree/main/agent-skill\" alt=\"AI Agent Skill directory\">\n        <img alt=\"Static Badge\" src=\"https://img.shields.io/badge/Skill-black?style=flat&label=Agent&link=https%3A%2F%2Fgithub.com%2FD4Vinci%2FScrapling%2Ftree%2Fmain%2Fagent-skill\"></a>\n    <a href=\"https://clawhub.ai/D4Vinci/scrapling-official\" alt=\"OpenClaw Skill\">\n        <img alt=\"OpenClaw Skill\" src=\"https://img.shields.io/badge/Clawhub-darkred?style=flat&label=OpenClaw&link=https%3A%2F%2Fclawhub.ai%2FD4Vinci%2Fscrapling-official\"></a>\n    <br/>\n    <a href=\"https://discord.gg/EMgGbDceNQ\" alt=\"Discord\" target=\"_blank\">\n      <img alt=\"Discord\" src=\"https://img.shields.io/discord/1360786381042880532?style=social&logo=discord&link=https%3A%2F%2Fdiscord.gg%2FEMgGbDceNQ\">\n    </a>\n    <a href=\"https://x.com/Scrapling_dev\" alt=\"X (formerly Twitter)\">\n      <img alt=\"X (formerly Twitter) Follow\" src=\"https://img.shields.io/twitter/follow/Scrapling_dev?style=social&logo=x&link=https%3A%2F%2Fx.com%2FScrapling_dev\">\n    </a>\n    <br/>\n    <a href=\"https://pypi.org/project/scrapling/\" alt=\"Supported Python versions\">\n        <img alt=\"Supported Python versions\" src=\"https://img.shields.io/pypi/pyversions/scrapling.svg\"></a>\n</p>\n\n<p align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io/en/latest/parsing/selection.html\"><strong>Методы выбора</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/fetching/choosing.html\"><strong>Выбор Fetcher</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/architecture.html\"><strong>Пауки</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/spiders/proxy-blocking.html\"><strong>Ротация прокси</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/cli/overview.html\"><strong>CLI</strong></a>\n    &middot;\n    <a href=\"https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html\"><strong>Режим MCP</strong></a>\n</p>\n\nScrapling — это адаптивный фреймворк для Web Scraping, который берёт на себя всё: от одного запроса до полномасштабного обхода сайтов.\n\nЕго парсер учится на изменениях сайтов и автоматически перемещает ваши элементы при обновлении страниц. 
Его Fetcher'ы обходят анти-бот системы вроде Cloudflare Turnstile прямо из коробки. А его Spider-фреймворк позволяет масштабироваться до параллельных, многосессионных обходов с Pause & Resume и автоматической ротацией Proxy — и всё это в нескольких строках Python. Одна библиотека, без компромиссов.\n\nМолниеносно быстрые обходы с отслеживанием статистики в реальном времени и Streaming. Создано веб-скраперами для веб-скраперов и обычных пользователей — здесь есть что-то для каждого.\n\n```python\nfrom scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\nStealthyFetcher.adaptive = True\np = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)  # Загрузите сайт незаметно!\nproducts = p.css('.product', auto_save=True)                                        # Скрапьте данные, которые переживут изменения дизайна сайта!\nproducts = p.css('.product', adaptive=True)                                         # Позже, если структура сайта изменится, передайте `adaptive=True`, чтобы найти их!\n```\nИли масштабируйте до полного обхода\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass MySpider(Spider):\n  name = \"demo\"\n  start_urls = [\"https://example.com/\"]\n\n  async def parse(self, response: Response):\n      for item in response.css('.product'):\n          yield {\"title\": item.css('h2::text').get()}\n\nMySpider().start()\n```\n\n<p align=\"center\">\n    <a href=\"https://dataimpulse.com/?utm_source=scrapling&utm_medium=banner&utm_campaign=scrapling\" target=\"_blank\" style=\"display:flex; justify-content:center; padding:4px 0;\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/DataImpulse.png\" alt=\"At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.\" style=\"max-height:60px;\">\n    </a>\n</p>\n\n# Платиновые спонсоры\n<table>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling\" target=\"_blank\" title=\"Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png\">\n      </a>\n    </td>\n    <td> Scrapling справляется с Cloudflare Turnstile. Для защиты корпоративного уровня\n      <a href=\"https://hypersolutions.co?utm_source=github&utm_medium=readme&utm_campaign=scrapling\">\n        <b>Hyper Solutions</b>\n      </a> предоставляет API-эндпоинты, генерирующие валидные antibot-токены для <b>Akamai</b>, <b>DataDome</b>, <b>Kasada</b> и <b>Incapsula</b> . Простые API-вызовы, без автоматизации браузера.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://birdproxies.com/t/scrapling\" target=\"_blank\" title=\"At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg\">\n      </a>\n    </td>\n    <td>Мы создали\n      <a href=\"https://birdproxies.com/t/scrapling\">\n        <b>BirdProxies</b>\n      </a>, потому что прокси не должны быть сложными или дорогими. <br /> Быстрые резидентные и ISP прокси в 195+ локациях, честные цены и настоящая поддержка. 
<br />\n      <b>Попробуйте нашу игру FlappyBird на лендинге и получите бесплатные данные!</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\" target=\"_blank\" title=\"Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\">\n        <b>Evomi</b>\n      </a>: резидентные прокси от $0.49/ГБ. Браузер для скрапинга с полностью подменённым Chromium, резидентными IP, автоматическим решением CAPTCHA и обходом анти-бот систем. </br>\n      <b>Scraper API для получения результатов без лишних сложностей. Доступны интеграции с MCP и N8N.</b>\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\" title=\"Unlock the Power of Social Media Data & AI\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg\">\n      </a>\n    </td>\n    <td>\n      <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\">TikHub.io</a> предоставляет более 900 стабильных API на 16+ платформах, включая TikTok, X, YouTube и Instagram, с более чем 40 млн наборов данных. <br /> Также предлагает <a href=\"https://ai.tikhub.io/?ref=KarimShoair\" target=\"_blank\">AI-модели со скидкой</a> — Claude, GPT, GEMINI и другие со скидкой до 71%.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\" title=\"Scalable Web Data Access for AI Applications\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\">Nsocks</a> предоставляет быстрые резидентные и ISP прокси для разработчиков и скраперов. Глобальное покрытие IP, высокая анонимность, умная ротация и надёжная производительность для автоматизации и извлечения данных. Используйте <a href=\"https://www.xcrawl.com/?keyword=2p67aivg\" target=\"_blank\">Xcrawl</a> для упрощения масштабного веб-краулинга.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\" title=\"PetroSky delivers cutting-edge VPS hosting.\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png\">\n      </a>\n    </td>\n    <td>\n    Закройте ноутбук. Ваши скраперы продолжают работать. <br />\n    <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\">PetroSky VPS</a> - облачные серверы для непрерывной автоматизации. Машины на Windows и Linux с полным контролем. 
От €6,99/мес.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\" title=\"The #1 newsletter dedicated to Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png\">\n      </a>\n    </td>\n    <td>\n    Прочитайте полный обзор <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\">Scrapling на The Web Scraping Club</a> (ноябрь 2025) — рассылка №1, посвящённая веб-скрейпингу.\n    </td>\n  </tr>\n  <tr>\n    <td width=\"200\">\n      <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\" title=\"Proxy-Seller provides reliable proxy infrastructure for Web Scraping\">\n        <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png\">\n      </a>\n    </td>\n    <td>\n    <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\">Proxy-Seller</a> предоставляет надёжную прокси-инфраструктуру для веб-скрейпинга: IPv4, IPv6, ISP, резидентные и мобильные прокси со стабильной производительностью, широким географическим покрытием и гибкими тарифами для сбора данных в масштабах бизнеса.\n    </td>\n  </tr>\n</table>\n\n<i><sub>Хотите показать здесь свою рекламу? Нажмите [здесь](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646)</sub></i>\n# Спонсоры\n\n<!-- sponsors -->\n\n<a href=\"https://serpapi.com/?utm_source=scrapling\" target=\"_blank\" title=\"Scrape Google and other search engines with SerpApi\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png\"></a>\n<a href=\"https://visit.decodo.com/Dy6W0b\" target=\"_blank\" title=\"Try the Most Efficient Residential Proxies for Free\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png\"></a>\n<a href=\"https://hasdata.com/?utm_source=github&utm_medium=banner&utm_campaign=D4Vinci\" target=\"_blank\" title=\"The web scraping service that actually beats anti-bot systems!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/hasdata.png\"></a>\n<a href=\"https://proxyempire.io/?ref=scrapling&utm_source=scrapling\" target=\"_blank\" title=\"Collect The Data Your Project Needs with the Best Residential Proxies\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxyEmpire.png\"></a><a href=\"https://www.swiftproxy.net/\" target=\"_blank\" title=\"Unlock Reliable Proxy Services with Swiftproxy!\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png\"></a>\n<a href=\"https://www.rapidproxy.io/?ref=d4v\" target=\"_blank\" title=\"Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs.\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg\"></a>\n<a href=\"https://browser.cash/?utm_source=D4Vinci&utm_medium=referral\" target=\"_blank\" title=\"Browser Automation & AI Browser Agent Platform\"><img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/browserCash.png\"></a>\n\n<!-- /sponsors -->\n\n<i><sub>Хотите показать здесь свою рекламу? 
Нажмите [здесь](https://github.com/sponsors/D4Vinci) и выберите подходящий вам уровень!</sub></i>\n\n---\n\n## Ключевые особенности\n\n### Spider'ы — полноценный фреймворк для обхода сайтов\n- 🕷️ **Scrapy-подобный Spider API**: Определяйте Spider'ов с `start_urls`, async `parse` callback'ами и объектами `Request`/`Response`.\n- ⚡ **Параллельный обход**: Настраиваемые лимиты параллелизма, ограничение скорости по домену и задержки загрузки.\n- 🔄 **Поддержка нескольких сессий**: Единый интерфейс для HTTP-запросов и скрытных headless-браузеров в одном Spider — маршрутизируйте запросы к разным сессиям по ID.\n- 💾 **Pause & Resume**: Persistence обхода на основе Checkpoint'ов. Нажмите Ctrl+C для мягкой остановки; перезапустите, чтобы продолжить с того места, где вы остановились.\n- 📡 **Режим Streaming**: Стримьте извлечённые элементы по мере их поступления через `async for item in spider.stream()` со статистикой в реальном времени — идеально для UI, конвейеров и длительных обходов.\n- 🛡️ **Обнаружение заблокированных запросов**: Автоматическое обнаружение и повторная отправка заблокированных запросов с настраиваемой логикой.\n- 📦 **Встроенный экспорт**: Экспортируйте результаты через хуки и собственный конвейер или встроенный JSON/JSONL с `result.items.to_json()` / `result.items.to_jsonl()` соответственно.\n\n### Продвинутая загрузка сайтов с поддержкой Session\n- **HTTP-запросы**: Быстрые и скрытные HTTP-запросы с классом `Fetcher`. Может имитировать TLS fingerprint браузера, заголовки и использовать HTTP/3.\n- **Динамическая загрузка**: Загрузка динамических сайтов с полной автоматизацией браузера через класс `DynamicFetcher`, поддерживающий Chromium от Playwright и Google Chrome.\n- **Обход анти-ботов**: Расширенные возможности скрытности с `StealthyFetcher` и подмену fingerprint'ов. Может легко обойти все типы Cloudflare Turnstile/Interstitial с помощью автоматизации.\n- **Управление сессиями**: Поддержка постоянных сессий с классами `FetcherSession`, `StealthySession` и `DynamicSession` для управления cookie и состоянием между запросами.\n- **Ротация Proxy**: Встроенный `ProxyRotator` с циклической или пользовательскими стратегиями для всех типов сессий, а также переопределение Proxy для каждого запроса.\n- **Блокировка доменов**: Блокируйте запросы к определённым доменам (и их поддоменам) в браузерных Fetcher'ах.\n- **Поддержка async**: Полная async-поддержка во всех Fetcher'ах и выделенных async-классах сессий.\n\n### Адаптивный скрапинг и интеграция с ИИ\n- 🔄 **Умное отслеживание элементов**: Перемещайте элементы после изменений сайта с помощью интеллектуальных алгоритмов подобия.\n- 🎯 **Умный гибкий выбор**: CSS-селекторы, XPath-селекторы, поиск на основе фильтров, текстовый поиск, поиск по регулярным выражениям и многое другое.\n- 🔍 **Поиск похожих элементов**: Автоматически находите элементы, похожие на найденные.\n- 🤖 **MCP-сервер для использования с ИИ**: Встроенный MCP-сервер для Web Scraping с помощью ИИ и извлечения данных. MCP-сервер обладает мощными пользовательскими возможностями, которые используют Scrapling для извлечения целевого контента перед передачей его ИИ (Claude/Cursor/и т.д.), тем самым ускоряя операции и снижая затраты за счёт минимизации использования токенов. 
([демо-видео](https://www.youtube.com/watch?v=qyFk3ZNwOxE))\n\n### Высокопроизводительная и проверенная в боях архитектура\n- 🚀 **Молниеносная скорость**: Оптимизированная производительность, превосходящая большинство Python-библиотек для скрапинга.\n- 🔋 **Эффективное использование памяти**: Оптимизированные структуры данных и ленивая загрузка для минимального потребления памяти.\n- ⚡ **Быстрая сериализация JSON**: В 10 раз быстрее стандартной библиотеки.\n- 🏗️ **Проверено в боях**: Scrapling имеет не только 92% покрытия тестами и полное покрытие type hints, но и ежедневно использовался сотнями веб-скраперов в течение последнего года.\n\n### Удобный для разработчиков/веб-скраперов опыт\n- 🎯 **Интерактивная Web Scraping Shell**: Опциональная встроенная IPython-оболочка с интеграцией Scrapling, ярлыками и новыми инструментами для ускорения разработки скриптов Web Scraping, такими как преобразование curl-запросов в запросы Scrapling и просмотр результатов запросов в браузере.\n- 🚀 **Используйте прямо из терминала**: При желании вы можете использовать Scrapling для скрапинга URL без написания ни одной строки кода!\n- 🛠️ **Богатый API навигации**: Расширенный обход DOM с методами навигации по родителям, братьям и детям.\n- 🧬 **Улучшенная обработка текста**: Встроенные регулярные выражения, методы очистки и оптимизированные операции со строками.\n- 📝 **Автоматическая генерация селекторов**: Генерация надёжных CSS/XPath-селекторов для любого элемента.\n- 🔌 **Знакомый API**: Похож на Scrapy/BeautifulSoup с теми же псевдоэлементами, используемыми в Scrapy/Parsel.\n- 📘 **Полное покрытие типами**: Полные type hints для отличной поддержки IDE и автодополнения кода. Вся кодовая база автоматически проверяется **PyRight** и **MyPy** при каждом изменении.\n- 🔋 **Готовый Docker-образ**: С каждым релизом автоматически создаётся и публикуется Docker-образ, содержащий все браузеры.\n\n## Начало работы\n\nДавайте кратко покажем, на что способен Scrapling, без глубокого погружения.\n\n### Базовое использование\nHTTP-запросы с поддержкой Session\n```python\nfrom scrapling.fetchers import Fetcher, FetcherSession\n\nwith FetcherSession(impersonate='chrome') as session:  # Используйте последнюю версию TLS fingerprint Chrome\n    page = session.get('https://quotes.toscrape.com/', stealthy_headers=True)\n    quotes = page.css('.quote .text::text').getall()\n\n# Или используйте одноразовые запросы\npage = Fetcher.get('https://quotes.toscrape.com/')\nquotes = page.css('.quote .text::text').getall()\n```\nРасширенный режим скрытности\n```python\nfrom scrapling.fetchers import StealthyFetcher, StealthySession\n\nwith StealthySession(headless=True, solve_cloudflare=True) as session:  # Держите браузер открытым, пока не закончите\n    page = session.fetch('https://nopecha.com/demo/cloudflare', google_search=False)\n    data = page.css('#padded_content a').getall()\n\n# Или используйте стиль одноразового запроса — открывает браузер для этого запроса, затем закрывает его после завершения\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare')\ndata = page.css('#padded_content a').getall()\n```\nПолная автоматизация браузера\n```python\nfrom scrapling.fetchers import DynamicFetcher, DynamicSession\n\nwith DynamicSession(headless=True, disable_resources=False, network_idle=True) as session:  # Держите браузер открытым, пока не закончите\n    page = session.fetch('https://quotes.toscrape.com/', load_dom=False)\n    data = page.xpath('//span[@class=\"text\"]/text()').getall()  # XPath-селектор, если вы предпочитаете 
его\n\n# Или используйте стиль одноразового запроса — открывает браузер для этого запроса, затем закрывает его после завершения\npage = DynamicFetcher.fetch('https://quotes.toscrape.com/')\ndata = page.css('.quote .text::text').getall()\n```\n\n### Spider'ы\nСоздавайте полноценные обходчики с параллельными запросами, несколькими типами сессий и Pause & Resume:\n```python\nfrom scrapling.spiders import Spider, Request, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com/\"]\n    concurrent_requests = 10\n\n    async def parse(self, response: Response):\n        for quote in response.css('.quote'):\n            yield {\n                \"text\": quote.css('.text::text').get(),\n                \"author\": quote.css('.author::text').get(),\n            }\n\n        next_page = response.css('.next a')\n        if next_page:\n            yield response.follow(next_page[0].attrib['href'])\n\nresult = QuotesSpider().start()\nprint(f\"Извлечено {len(result.items)} цитат\")\nresult.items.to_json(\"quotes.json\")\n```\nИспользуйте несколько типов сессий в одном Spider:\n```python\nfrom scrapling.spiders import Spider, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass MultiSessionSpider(Spider):\n    name = \"multi\"\n    start_urls = [\"https://example.com/\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"fast\", FetcherSession(impersonate=\"chrome\"))\n        manager.add(\"stealth\", AsyncStealthySession(headless=True), lazy=True)\n\n    async def parse(self, response: Response):\n        for link in response.css('a::attr(href)').getall():\n            # Направляйте защищённые страницы через stealth-сессию\n            if \"protected\" in link:\n                yield Request(link, sid=\"stealth\")\n            else:\n                yield Request(link, sid=\"fast\", callback=self.parse)  # явный callback\n```\nПриостанавливайте и возобновляйте длительные обходы с помощью Checkpoint'ов, запуская Spider следующим образом:\n```python\nQuotesSpider(crawldir=\"./crawl_data\").start()\n```\nНажмите Ctrl+C для мягкой остановки — прогресс сохраняется автоматически. 
Позже, когда вы снова запустите Spider, передайте тот же `crawldir`, и он продолжит с того места, где остановился.\n\n### Продвинутый парсинг и навигация\n```python\nfrom scrapling.fetchers import Fetcher\n\n# Богатый выбор элементов и навигация\npage = Fetcher.get('https://quotes.toscrape.com/')\n\n# Получение цитат различными методами выбора\nquotes = page.css('.quote')  # CSS-селектор\nquotes = page.xpath('//div[@class=\"quote\"]')  # XPath\nquotes = page.find_all('div', {'class': 'quote'})  # В стиле BeautifulSoup\n# То же самое, что\nquotes = page.find_all('div', class_='quote')\nquotes = page.find_all(['div'], class_='quote')\nquotes = page.find_all(class_='quote')  # и так далее...\n# Найти элемент по текстовому содержимому\nquotes = page.find_by_text('quote', tag='div')\n\n# Продвинутая навигация\nquote_text = page.css('.quote')[0].css('.text::text').get()\nquote_text = page.css('.quote').css('.text::text').getall()  # Цепочка селекторов\nfirst_quote = page.css('.quote')[0]\nauthor = first_quote.next_sibling.css('.author::text')\nparent_container = first_quote.parent\n\n# Связи элементов и подобие\nsimilar_elements = first_quote.find_similar()\nbelow_elements = first_quote.below_elements()\n```\nВы можете использовать парсер напрямую, если не хотите загружать сайты, как показано ниже:\n```python\nfrom scrapling.parser import Selector\n\npage = Selector(\"<html>...</html>\")\n```\nИ он работает точно так же!\n\n### Примеры async Session\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, AsyncDynamicSession\n\nasync with FetcherSession(http3=True) as session:  # `FetcherSession` контекстно-осведомлён и может работать как в sync, так и в async-режимах\n    page1 = session.get('https://quotes.toscrape.com/')\n    page2 = session.get('https://quotes.toscrape.com/', impersonate='firefox135')\n\n# Использование async-сессии\nasync with AsyncStealthySession(max_pages=2) as session:\n    tasks = []\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n\n    for url in urls:\n        task = session.fetch(url)\n        tasks.append(task)\n\n    print(session.get_pool_stats())  # Опционально — статус пула вкладок браузера (занят/свободен/ошибка)\n    results = await asyncio.gather(*tasks)\n    print(session.get_pool_stats())\n```\n\n## CLI и интерактивная Shell\n\nScrapling включает мощный интерфейс командной строки:\n\n[![asciicast](https://asciinema.org/a/736339.svg)](https://asciinema.org/a/736339)\n\nЗапустить интерактивную Web Scraping Shell\n```bash\nscrapling shell\n```\nИзвлечь страницы в файл напрямую без программирования (по умолчанию извлекает содержимое внутри тега `body`). Если выходной файл заканчивается на `.txt`, будет извлечено текстовое содержимое цели. 
Если заканчивается на `.md`, это будет Markdown-представление HTML-содержимого; если заканчивается на `.html`, это будет само HTML-содержимое.\n```bash\nscrapling extract get 'https://example.com' content.md\nscrapling extract get 'https://example.com' content.txt --css-selector '#fromSkipToProducts' --impersonate 'chrome'  # Все элементы, соответствующие CSS-селектору '#fromSkipToProducts'\nscrapling extract fetch 'https://example.com' content.md --css-selector '#fromSkipToProducts' --no-headless\nscrapling extract stealthy-fetch 'https://nopecha.com/demo/cloudflare' captchas.html --css-selector '#padded_content a' --solve-cloudflare\n```\n\n> [!NOTE]\n> Есть множество дополнительных возможностей, включая MCP-сервер и интерактивную Web Scraping Shell, но мы хотим сохранить эту страницу краткой. Ознакомьтесь с полной документацией [здесь](https://scrapling.readthedocs.io/en/latest/)\n\n## Тесты производительности\n\nScrapling не только мощный — он ещё и невероятно быстрый. Следующие тесты производительности сравнивают парсер Scrapling с последними версиями других популярных библиотек.\n\n### Тест скорости извлечения текста (5000 вложенных элементов)\n\n| # |    Библиотека     | Время (мс) | vs Scrapling |\n|---|:-----------------:|:----------:|:------------:|\n| 1 |     Scrapling     |    2.02    |     1.0x     |\n| 2 |   Parsel/Scrapy   |    2.04    |    1.01x     |\n| 3 |     Raw Lxml      |    2.54    |    1.257x    |\n| 4 |      PyQuery      |   24.17    |     ~12x     |\n| 5 |    Selectolax     |   82.63    |     ~41x     |\n| 6 |  MechanicalSoup   |  1549.71   |   ~767.1x    |\n| 7 |   BS4 with Lxml   |  1584.31   |   ~784.3x    |\n| 8 | BS4 with html5lib |  3391.91   |   ~1679.1x   |\n\n\n### Производительность подобия элементов и текстового поиска\n\nВозможности адаптивного поиска элементов Scrapling значительно превосходят альтернативы:\n\n| Библиотека  | Время (мс) | vs Scrapling |\n|-------------|:----------:|:------------:|\n| Scrapling   |    2.39    |     1.0x     |\n| AutoScraper |   12.45    |    5.209x    |\n\n\n> Все тесты производительности представляют собой средние значения более 100 запусков. См. [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) для методологии.\n\n## Установка\n\nScrapling требует Python 3.10 или выше:\n\n```bash\npip install scrapling\n```\n\nЭта установка включает только движок парсера и его зависимости, без каких-либо Fetcher'ов или зависимостей командной строки.\n\n### Опциональные зависимости\n\n1. Если вы собираетесь использовать какие-либо из дополнительных возможностей ниже, Fetcher'ы или их классы, вам необходимо установить зависимости Fetcher'ов и браузеров следующим образом:\n    ```bash\n    pip install \"scrapling[fetchers]\"\n\n    scrapling install           # normal install\n    scrapling install  --force  # force reinstall\n    ```\n\n    Это загрузит все браузеры вместе с их системными зависимостями и зависимостями для манипуляции fingerprint'ами.\n\n    Или вы можете установить их из кода вместо выполнения команды:\n    ```python\n    from scrapling.cli import install\n\n    install([], standalone_mode=False)          # normal install\n    install([\"--force\"], standalone_mode=False) # force reinstall\n    ```\n\n2. 
Дополнительные возможности:\n   - Установить функцию MCP-сервера:\n       ```bash\n       pip install \"scrapling[ai]\"\n       ```\n   - Установить функции Shell (Web Scraping Shell и команда `extract`):\n       ```bash\n       pip install \"scrapling[shell]\"\n       ```\n   - Установить всё:\n       ```bash\n       pip install \"scrapling[all]\"\n       ```\n   Помните, что вам нужно установить зависимости браузеров с помощью `scrapling install` после любого из этих дополнений (если вы ещё этого не сделали).\n\n### Docker\nВы также можете установить Docker-образ со всеми дополнениями и браузерами с помощью следующей команды из DockerHub:\n```bash\ndocker pull pyd4vinci/scrapling\n```\nИли скачайте его из реестра GitHub:\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\nЭтот образ автоматически создаётся и публикуется с помощью GitHub Actions из основной ветки репозитория.\n\n## Участие в разработке\n\nМы приветствуем участие! Пожалуйста, прочитайте наши [руководства по участию в разработке](https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md) перед началом работы.\n\n## Отказ от ответственности\n\n> [!CAUTION]\n> Эта библиотека предоставляется только в образовательных и исследовательских целях. Используя эту библиотеку, вы соглашаетесь соблюдать местные и международные законы о скрапинге данных и конфиденциальности. Авторы и участники не несут ответственности за любое неправомерное использование этого программного обеспечения. Всегда уважайте условия обслуживания веб-сайтов и файлы robots.txt.\n\n## 🎓 Цитирование\nЕсли вы использовали нашу библиотеку в исследовательских целях, пожалуйста, цитируйте нас со следующей ссылкой:\n```text\n  @misc{scrapling,\n    author = {Karim Shoair},\n    title = {Scrapling},\n    year = {2024},\n    url = {https://github.com/D4Vinci/Scrapling},\n    note = {An adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl!}\n  }\n```\n\n## Лицензия\n\nЭта работа лицензирована по лицензии BSD-3-Clause.\n\n## Благодарности\n\nЭтот проект включает код, адаптированный из:\n- Parsel (лицензия BSD) — Используется для подмодуля [translator](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/translator.py)\n\n---\n<div align=\"center\"><small>Разработано и создано с ❤️ Каримом Шоаиром.</small></div><br>\n"
  },
  {
    "path": "docs/ai/mcp-server.md",
    "content": "# Scrapling MCP Server Guide\n\n<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/qyFk3ZNwOxE?si=3FHzgcYCb66iJ6e3\" title=\"YouTube video player\" frameborder=\"0\" allow=\"accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share\" referrerpolicy=\"strict-origin-when-cross-origin\" allowfullscreen></iframe>\n\nThe **Scrapling MCP Server** is a new feature that brings Scrapling's powerful Web Scraping capabilities directly to your favorite AI chatbot or AI agent. This integration allows you to scrape websites, extract data, and bypass anti-bot protections conversationally through Claude's AI interface or any interface that supports MCP.\n\n## Features\n\nThe Scrapling MCP Server provides six powerful tools for web scraping:\n\n### 🚀 Basic HTTP Scraping\n- **`get`**: Fast HTTP requests with browser fingerprint impersonation, generating real browser headers matching the TLS version, HTTP/3, and more!\n- **`bulk_get`**: An async version of the above tool that allows scraping of multiple URLs at the same time!\n\n### 🌐 Dynamic Content Scraping  \n- **`fetch`**: Rapidly fetch dynamic content with Chromium/Chrome browser with complete control over the request/browser, and more!\n- **`bulk_fetch`**: An async version of the above tool that allows scraping of multiple URLs in different browser tabs at the same time!\n\n### 🔒 Stealth Scraping\n- **`stealthy_fetch`**: Uses our Stealthy browser to bypass Cloudflare Turnstile/Interstitial and other anti-bot systems with complete control over the request/browser! \n- **`bulk_stealthy_fetch`**: An async version of the above tool that allows stealth scraping of multiple URLs in different browser tabs at the same time!\n\n### Key Capabilities\n- **Smart Content Extraction**: Convert web pages/elements to Markdown, HTML, or extract a clean version of the text content\n- **CSS Selector Support**: Use the Scrapling engine to target specific elements with precision before handing the content to the AI\n- **Anti-Bot Bypass**: Handle Cloudflare Turnstile, Interstitial, and other protections\n- **Proxy Support**: Use proxies for anonymity and geo-targeting\n- **Browser Impersonation**: Mimic real browsers with TLS fingerprinting, real browser headers matching that version, and more\n- **Parallel Processing**: Scrape multiple URLs concurrently for efficiency\n\n#### But why use Scrapling MCP Server instead of other available tools?\n\nAside from its stealth capabilities and ability to bypass Cloudflare Turnstile/Interstitial, Scrapling's server is the only one that lets you select specific elements to pass to the AI, saving a lot of time and tokens!\n\nThe way other servers work is that they extract the content, then pass it all to the AI to extract the fields you want. This causes the AI to consume far more tokens than needed (from irrelevant content). Scrapling solves this problem by allowing you to pass a CSS selector to narrow down the content you want before passing it to the AI, which makes the whole process much faster and more efficient.\n\nIf you don't know how to write/use CSS selectors, don't worry. 
You can tell the AI in the prompt to write selectors to match possible fields for you and watch it try different combinations until it finds the right one, as we will show in the examples section.\n\n## Installation\n\nInstall Scrapling with MCP Support, then double-check that the browser dependencies are installed.\n\n```bash\n# Install Scrapling with MCP server dependencies\npip install \"scrapling[ai]\"\n\n# Install browser dependencies\nscrapling install\n```\n\nOr use the Docker image directly from the Docker registry:\n```bash\ndocker pull pyd4vinci/scrapling\n```\nOr download it from the GitHub registry:\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\n\n## Setting up the MCP Server\n\nHere we will explain how to add Scrapling MCP Server to [Claude Desktop](https://claude.ai/download) and [Claude Code](https://www.anthropic.com/claude-code), but the same logic applies to any other chatbot that supports MCP:\n\n### Claude Desktop\n\n1. Open Claude Desktop\n2. Click the hamburger menu (☰) at the top left → Settings → Developer → Edit Config\n3. Add the Scrapling MCP server configuration:\n```json\n\"ScraplingServer\": {\n  \"command\": \"scrapling\",\n  \"args\": [\n    \"mcp\"\n  ]\n}\n```\nIf that's the first MCP server you're adding, set the content of the file to this: \n```json\n{\n  \"mcpServers\": {\n    \"ScraplingServer\": {\n      \"command\": \"scrapling\",\n      \"args\": [\n        \"mcp\"\n      ]\n    }\n  }\n}\n```\nAs per the [official article](https://modelcontextprotocol.io/quickstart/user), this action either creates a new configuration file if none exists or opens your existing configuration. The file is located at\n\n1. **MacOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`\n2. **Windows**: `%APPDATA%\\Claude\\claude_desktop_config.json`\n\nTo ensure it's working, use the full path to the `scrapling` executable. Open the terminal and execute the following command:\n\n1. **MacOS**: `which scrapling`\n2. **Windows**: `where scrapling`\n\nFor me, on my Mac, it returned `/Users/<MyUsername>/.venv/bin/scrapling`, so the config I used in the end is:\n```json\n{\n  \"mcpServers\": {\n    \"ScraplingServer\": {\n      \"command\": \"/Users/<MyUsername>/.venv/bin/scrapling\",\n      \"args\": [\n        \"mcp\"\n      ]\n    }\n  }\n}\n```\n#### Docker\nIf you are using the Docker image, then it would be something like\n```json\n{\n  \"mcpServers\": {\n    \"ScraplingServer\": {\n      \"command\": \"docker\",\n      \"args\": [\n        \"run\", \"-i\", \"--rm\", \"scrapling\", \"mcp\"\n      ]\n    }\n  }\n}\n```\n\nThe same logic applies to [Cursor](https://cursor.com/docs/context/mcp), [WindSurf](https://windsurf.com/university/tutorials/configuring-first-mcp-server), and others.\n\n### Claude Code\nHere it's much simpler to do. If you have [Claude Code](https://www.anthropic.com/claude-code) installed, open the terminal and execute the following command:\n\n```bash\nclaude mcp add ScraplingServer \"/Users/<MyUsername>/.venv/bin/scrapling\" mcp\n```\nSame as above, to get Scrapling's executable path, open the terminal and execute the following command:\n\n1. **MacOS**: `which scrapling`\n2. **Windows**: `where scrapling`\n\nHere's the main article from Anthropic on [how to add MCP servers to Claude code](https://docs.anthropic.com/en/docs/claude-code/mcp#option-1%3A-add-a-local-stdio-server) for further details.\n\n\nThen, after you've added the server, you need to completely quit and restart the app you used above. 
In Claude Desktop, you should see an MCP server indicator (🔧) in the bottom-right corner of the chat input or see `ScraplingServer` in the `Search and tools` dropdown in the chat input box.\n\n### Streamable HTTP\nAs of version 0.3.6, the MCP server can use the 'Streamable HTTP' transport mode instead of the traditional 'stdio' transport.\n\nSo instead of using the following command (the 'stdio' one):\n```bash\nscrapling mcp\n```\nUse the following to enable the 'Streamable HTTP' transport mode:\n```bash\nscrapling mcp --http\n```\nIn this mode, the server listens on host '0.0.0.0' and port 8000 by default; both can be configured as below:\n```bash\nscrapling mcp --http --host '127.0.0.1' --port 8000\n```\n\n## Examples\n\nNow we will show you some examples of prompts we used while testing the MCP server, but you are probably more creative and better at prompt engineering than we are :)\n\nWe will gradually go from simple prompts to more complex ones. We will use Claude Desktop for the examples, but the same logic applies to the rest, of course.\n\n1. **Basic Web Scraping**\n\n    Extract the main content from a webpage as Markdown:\n    \n    ```\n    Scrape the main content from https://example.com and convert it to markdown format.\n    ```\n    \n    Claude will use the `get` tool to fetch the page and return clean, readable content. If it fails, it will continue retrying every second for 3 attempts, unless you instruct it otherwise. If it fails to retrieve content for any reason, such as protection or if it's a dynamic website, it will automatically try the other tools. If Claude didn't do that automatically for some reason, you can add that to the prompt.\n    \n    A more optimized version of the same prompt would be:\n    ```\n    Use regular requests to scrape the main content from https://example.com and convert it to markdown format.\n    ```\n    This tells Claude which tool to use here, so it doesn't have to guess. Sometimes it will start using normal requests on its own, and at other times, it will assume browsers are better suited for this website without any apparent reason. As a rule of thumb, you should always tell Claude which tool to use to save time and money and get consistent results.\n\n2. **Targeted Data Extraction**\n\n    Extract specific elements using CSS selectors:\n    \n    ```\n    Get all product titles from https://shop.example.com using the CSS selector '.product-title'. If the request fails, retry up to 5 times every 10 seconds.\n    ```\n    \n    The server will extract only the elements matching your selector and return them as a structured list. Notice I told it to retry up to 5 times in case the website has connection issues, but the default setting should be fine for most cases.\n\n3. **E-commerce Data Collection**\n\n    Another example of a bit more complex prompt:\n    ```\n    Extract product information from these e-commerce URLs using bulk browser fetches:\n    - https://shop1.com/product-a\n    - https://shop2.com/product-b  \n    - https://shop3.com/product-c\n    \n    Get the product names, prices, and descriptions from each page.\n    ```\n    \n    Claude will use `bulk_fetch` to concurrently scrape all URLs, then analyze the extracted data.\n\n4. **More advanced workflow**\n\n    Let's say I want to get all the action games available on the first page of the PlayStation Store right now. 
I can use the following prompt to do that:\n    ```\n    Extract the URLs of all games in this page, then do a bulk request to them and return a list of all action games: https://store.playstation.com/en-us/pages/browse\n    ```\n    Note that I instructed it to use a bulk request for all the URLs collected. If I hadn't mentioned it, sometimes it works as intended, and other times it makes a separate request to each URL, which takes significantly longer. This prompt takes approximately one minute to complete.\n    \n    However, because I wasn't specific enough, it actually used the `stealthy_fetch` here and the `bulk_stealthy_fetch` in the second step, which unnecessarily consumed a large number of tokens. A better prompt would be:\n    ```\n    Use normal requests to extract the URLs of all games in this page, then do a bulk request to them and return a list of all action games: https://store.playstation.com/en-us/pages/browse\n    ```\n    And if you know how to write CSS selectors, you can instruct Claude to apply the selectors to the elements you want, and it will nearly complete the task immediately.\n    ```\n    Use normal requests to extract the URLs of all games on the page below, then perform a bulk request to them and return a list of all action games.\n    The selector for games in the first page is `[href*=\"/concept/\"]` and the selector for the genre in the second request is `[data-qa=\"gameInfo#releaseInformation#genre-value\"]`.\n    \n    URL: https://store.playstation.com/en-us/pages/browse\n    ```\n\n5. **Get data from a website with Cloudflare protection**\n\n    If you think the website you are targeting has Cloudflare protection, tell Claude instead of letting it discover it on its own.\n    ```\n    What's the price of this product? Be cautious, as it utilizes Cloudflare's Turnstile protection. Make the browser visible while you work.\n\n    https://ao.com/product/oo101uk-ninja-woodfire-outdoor-pizza-oven-brown-99357-685.aspx\n    ```\n\n6. **Long workflow**\n\n    You can, for example, use a prompt like this:\n    ```\n    Extract all product URLs for the following category, then return the prices and details for the first 3 products.\n    \n    https://www.arnotts.ie/furniture/bedroom/bed-frames/\n    ```\n    But a better prompt would be:\n    ```\n    Go to the following category URL and extract all product URLs using the CSS selector \"a\". Then, fetch the first 3 product pages in parallel and extract each product’s price and details.\n    \n    Keep the output in markdown format to reduce irrelevant content.\n    \n    Category URL:\n    https://www.arnotts.ie/furniture/bedroom/bed-frames/\n    ```\n\nAnd so on, you get the idea. Your creativity is the key here.\n\n## Best Practices\n\nHere is some technical advice for you.\n\n### 1. Choose the Right Tool\n- **`get`**: Fast, simple websites\n- **`fetch`**: Sites with JavaScript/dynamic content  \n- **`stealthy_fetch`**: Protected sites, Cloudflare, anti-bot systems\n\n### 2. Optimize Performance\n- Use bulk tools for multiple URLs\n- Disable unnecessary resources\n- Set appropriate timeouts\n- Use CSS selectors for targeted extraction\n\n### 3. Handle Dynamic Content\n- Use `network_idle` for SPAs\n- Set `wait_selector` for specific elements\n- Increase timeout for slow-loading sites\n\n### 4. 
Data Quality\n- Use `main_content_only=true` to avoid navigation/ads\n- Choose an appropriate `extraction_type` for your use case\n\n## Legal and Ethical Considerations\n\n⚠️ **Important Guidelines:**\n\n- **Check robots.txt**: Visit `https://website.com/robots.txt` to see scraping rules\n- **Respect rate limits**: Don't overwhelm servers with requests\n- **Terms of Service**: Read and comply with website terms\n- **Copyright**: Respect intellectual property rights\n- **Privacy**: Be mindful of personal data protection laws\n- **Commercial use**: Ensure you have permission for business purposes\n\n---\n\n*Built with ❤️ by the Scrapling team. Happy scraping!*"
  },
  {
    "path": "docs/api-reference/custom-types.md",
    "content": "---\nsearch:\n  exclude: true\n---\n\n# Custom Types API Reference\n\nHere's the reference information for all custom types of classes Scrapling implemented, with all their parameters, attributes, and methods.\n\nYou can import all of them directly like below:\n\n```python\nfrom scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler\n```\n\n## ::: scrapling.core.custom_types.TextHandler\n    handler: python\n    :docstring:\n\n## ::: scrapling.core.custom_types.TextHandlers\n    handler: python\n    :docstring:\n\n## ::: scrapling.core.custom_types.AttributesHandler\n    handler: python\n    :docstring:\n"
  },
  {
    "path": "docs/api-reference/fetchers.md",
    "content": "---\nsearch:\n  exclude: true\n---\n\n# Fetchers Classes\n\nHere's the reference information for all fetcher-type classes' parameters, attributes, and methods.\n\nYou can import all of them directly like below:\n\n```python\nfrom scrapling.fetchers import (\n    Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher,\n    FetcherSession, AsyncStealthySession, StealthySession, DynamicSession, AsyncDynamicSession\n)\n```\n\n## ::: scrapling.fetchers.Fetcher\n    handler: python\n    :docstring:\n\n## ::: scrapling.fetchers.AsyncFetcher\n    handler: python\n    :docstring:\n\n## ::: scrapling.fetchers.DynamicFetcher\n    handler: python\n    :docstring:\n\n## ::: scrapling.fetchers.StealthyFetcher\n    handler: python\n    :docstring:\n\n\n## Session Classes\n\n### HTTP Sessions\n\n## ::: scrapling.fetchers.FetcherSession\n    handler: python\n    :docstring:\n\n### Stealth Sessions\n\n## ::: scrapling.fetchers.StealthySession\n    handler: python\n    :docstring:\n\n## ::: scrapling.fetchers.AsyncStealthySession\n    handler: python\n    :docstring:\n\n### Dynamic Sessions\n\n## ::: scrapling.fetchers.DynamicSession\n    handler: python\n    :docstring:\n\n## ::: scrapling.fetchers.AsyncDynamicSession\n    handler: python\n    :docstring:\n\n"
  },
  {
    "path": "docs/api-reference/mcp-server.md",
    "content": "---\nsearch:\n  exclude: true\n---\n\n# MCP Server API Reference\n\nThe **Scrapling MCP Server** provides six powerful tools for web scraping through the Model Context Protocol (MCP). This server integrates Scrapling's capabilities directly into AI chatbots and agents, allowing conversational web scraping with advanced anti-bot bypass features.\n\nYou can start the MCP server by running:\n\n```bash\nscrapling mcp\n```\n\nOr import the server class directly:\n\n```python\nfrom scrapling.core.ai import ScraplingMCPServer\n\nserver = ScraplingMCPServer()\nserver.serve(http=False, host=\"0.0.0.0\", port=8000)\n```\n\n## Response Model\n\nThe standardized response structure that's returned by all MCP server tools:\n\n## ::: scrapling.core.ai.ResponseModel\n    handler: python\n    :docstring:\n\n## MCP Server Class\n\nThe main MCP server class that provides all web scraping tools:\n\n## ::: scrapling.core.ai.ScraplingMCPServer\n    handler: python\n    :docstring:"
  },
  {
    "path": "docs/api-reference/proxy-rotation.md",
    "content": "---\nsearch:\n  exclude: true\n---\n\n# Proxy Rotation\n\nThe `ProxyRotator` class provides thread-safe proxy rotation for any fetcher or session.\n\nYou can import it directly like below:\n\n```python\nfrom scrapling.fetchers import ProxyRotator\n```\n\n## ::: scrapling.engines.toolbelt.proxy_rotation.ProxyRotator\n    handler: python\n    :docstring:\n"
  },
  {
    "path": "docs/api-reference/response.md",
    "content": "---\nsearch:\n  exclude: true\n---\n\n# Response Class\n\nThe `Response` class wraps HTTP responses returned by all fetchers, providing access to status, headers, body, cookies, and a `Selector` for parsing.\n\nYou can import the `Response` class like below:\n\n```python\nfrom scrapling.engines.toolbelt.custom import Response\n```\n\n## ::: scrapling.engines.toolbelt.custom.Response\n    handler: python\n    :docstring:\n"
  },
  {
    "path": "docs/api-reference/selector.md",
    "content": "---\nsearch:\n  exclude: true\n---\n\n# Selector Class\n\nThe `Selector` class is the core parsing engine in Scrapling that provides HTML parsing and element selection capabilities.\n\nHere's the reference information for the `Selector` class, with all its parameters, attributes, and methods.\n\nYou can import the `Selector` class directly from `scrapling`:\n\n```python\nfrom scrapling.parser import Selector\n```\n\n## ::: scrapling.parser.Selector\n    handler: python\n    :docstring:\n\n## ::: scrapling.parser.Selectors\n    handler: python\n    :docstring:\n\n"
  },
  {
    "path": "docs/api-reference/spiders.md",
    "content": "---\nsearch:\n  exclude: true\n---\n\n# Spider Classes\n\nHere's the reference information for the spider framework classes' parameters, attributes, and methods.\n\nYou can import them directly like below:\n\n```python\nfrom scrapling.spiders import Spider, Request, CrawlResult, SessionManager, Response\n```\n\n## ::: scrapling.spiders.Spider\n    handler: python\n    :docstring:\n\n## ::: scrapling.spiders.Request\n    handler: python\n    :docstring:\n\n## Result Classes\n\n## ::: scrapling.spiders.result.CrawlResult\n    handler: python\n    :docstring:\n\n## ::: scrapling.spiders.result.CrawlStats\n    handler: python\n    :docstring:\n\n## ::: scrapling.spiders.result.ItemList\n    handler: python\n    :docstring:\n\n## Session Management\n\n## ::: scrapling.spiders.session.SessionManager\n    handler: python\n    :docstring:\n"
  },
  {
    "path": "docs/benchmarks.md",
    "content": "# Performance Benchmarks\n\nScrapling isn't just powerful—it's also blazing fast. The following benchmarks compare Scrapling's parser with the latest versions of other popular libraries.\n\n### Text Extraction Speed Test (5000 nested elements)\n\n| # |      Library      | Time (ms) | vs Scrapling | \n|---|:-----------------:|:---------:|:------------:|\n| 1 |     Scrapling     |   2.02    |     1.0x     |\n| 2 |   Parsel/Scrapy   |   2.04    |     1.01     |\n| 3 |     Raw Lxml      |   2.54    |    1.257     |\n| 4 |      PyQuery      |   24.17   |     ~12x     |\n| 5 |    Selectolax     |   82.63   |     ~41x     |\n| 6 |  MechanicalSoup   |  1549.71  |   ~767.1x    |\n| 7 |   BS4 with Lxml   |  1584.31  |   ~784.3x    |\n| 8 | BS4 with html5lib |  3391.91  |   ~1679.1x   |\n\n\n### Element Similarity & Text Search Performance\n\nScrapling's adaptive element finding capabilities significantly outperform alternatives:\n\n| Library     | Time (ms) | vs Scrapling |\n|-------------|:---------:|:------------:|\n| Scrapling   |   2.39    |     1.0x     |\n| AutoScraper |   12.45   |    5.209x    |\n\n> All benchmarks represent averages of 100+ runs. See [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology.\n"
  },
  {
    "path": "docs/cli/extract-commands.md",
    "content": "# Scrapling Extract Command Guide\n\n**Web Scraping through the terminal without requiring any programming!**\n\nThe `scrapling extract` command lets you download and extract content from websites directly from your terminal without writing any code. Ideal for beginners, researchers, and anyone requiring rapid web data extraction.\n\n!!! success \"Prerequisites\"\n\n    1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.\n    2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.\n    3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.\n    4. You've completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md).\n\n\n## What is the Extract Command group?\n\nThe extract command is a set of simple terminal tools that:\n\n- **Downloads web pages** and saves their content to files.\n- **Converts HTML to readable formats** like Markdown, keeps it as HTML, or just extracts the text content of the page.\n- **Supports custom CSS selectors** to extract specific parts of the page.\n- **Handles HTTP requests and fetching through browsers**\n- **Highly customizable** with custom headers, cookies, proxies, and the rest of the options. 
Almost all the options available through the code are also accessible through the command line.\n\n## Quick Start\n\n- **Basic Website Download**\n\n    Download a website's text content as clean, readable text:\n    ```bash\n    scrapling extract get \"https://example.com\" page_content.txt\n    ```\n    This makes an HTTP GET request and saves the webpage's text content to `page_content.txt`.\n\n- **Save as Different Formats**\n\n    Choose your output format by changing the file extension:\n    ```bash\n    # Convert the HTML content to Markdown, then save it to the file (great for documentation)\n    scrapling extract get \"https://blog.example.com\" article.md\n    \n    # Save the HTML content as it is to the file\n    scrapling extract get \"https://example.com\" page.html\n    \n    # Save a clean version of the text content of the webpage to the file\n    scrapling extract get \"https://example.com\" content.txt\n  \n    # Or use the Docker image with something like this:\n    docker run -v $(pwd)/output:/output scrapling extract get \"https://blog.example.com\" /output/article.md \n    ```\n\n- **Extract Specific Content**\n\n    All commands can use CSS selectors to extract specific parts of the page through `--css-selector` or `-s` as you will see in the examples below.\n\n## Available Commands\n\nYou can display the available commands through `scrapling extract --help` to get the following list:\n```bash\nUsage: scrapling extract [OPTIONS] COMMAND [ARGS]...\n\n  Fetch web pages using various fetchers and extract full/selected HTML content as HTML, Markdown, or extract text content.\n\nOptions:\n  --help  Show this message and exit.\n\nCommands:\n  get             Perform a GET request and save the content to a file.\n  post            Perform a POST request and save the content to a file.\n  put             Perform a PUT request and save the content to a file.\n  delete          Perform a DELETE request and save the content to a file.\n  fetch           Use DynamicFetcher to fetch content with browser...\n  stealthy-fetch  Use StealthyFetcher to fetch content with advanced...\n```\n\nWe will go through each command in detail below.\n\n### HTTP Requests\n\n1. 
**GET Request**\n\n    The most common command for downloading website content:\n    \n    ```bash\n    scrapling extract get [URL] [OUTPUT_FILE] [OPTIONS]\n    ```\n    \n    **Examples:**\n    ```bash\n    # Basic download\n    scrapling extract get \"https://news.site.com\" news.md\n    \n    # Download with custom timeout\n    scrapling extract get \"https://example.com\" content.txt --timeout 60\n    \n    # Extract only specific content using CSS selectors\n    scrapling extract get \"https://blog.example.com\" articles.md --css-selector \"article\"\n   \n    # Send a request with cookies\n    scrapling extract get \"https://scrapling.requestcatcher.com\" content.md --cookies \"session=abc123; user=john\"\n   \n    # Add user agent\n    scrapling extract get \"https://api.site.com\" data.json -H \"User-Agent: MyBot 1.0\"\n    \n    # Add multiple headers\n    scrapling extract get \"https://site.com\" page.html -H \"Accept: text/html\" -H \"Accept-Language: en-US\"\n    ```\n    Get the available options for the command with `scrapling extract get --help` as follows:\n    ```bash\n    Usage: scrapling extract get [OPTIONS] URL OUTPUT_FILE\n    \n      Perform a GET request and save the content to a file.\n    \n      The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively.\n    \n    Options:\n      -H, --headers TEXT                             HTTP headers in format \"Key: Value\" (can be used multiple times)\n      --cookies TEXT                                 Cookies string in format \"name1=value1;name2=value2\"\n      --timeout INTEGER                              Request timeout in seconds (default: 30)\n      --proxy TEXT                                   Proxy URL in format \"http://username:password@host:port\"\n      -s, --css-selector TEXT                        CSS selector to extract specific content from the page. It returns all matches.\n      -p, --params TEXT                              Query parameters in format \"key=value\" (can be used multiple times)\n      --follow-redirects / --no-follow-redirects     Whether to follow redirects (default: True)\n      --verify / --no-verify                         Whether to verify SSL certificates (default: True)\n      --impersonate TEXT                             Browser to impersonate (e.g., chrome, firefox).\n      --stealthy-headers / --no-stealthy-headers     Use stealthy browser headers (default: True)\n      --help                                         Show this message and exit.\n    \n    ```\n    Note that the options will work in the same way for all other request commands, so no need to repeat them.\n\n2. **Post Request**\n    \n    ```bash\n    scrapling extract post [URL] [OUTPUT_FILE] [OPTIONS]\n    ```\n    \n    **Examples:**\n    ```bash\n    # Submit form data\n    scrapling extract post \"https://api.site.com/search\" results.html --data \"query=python&type=tutorial\"\n    \n    # Send JSON data\n    scrapling extract post \"https://api.site.com\" response.json --json '{\"username\": \"test\", \"action\": \"search\"}'\n    ```\n    Get the available options for the command with `scrapling extract post --help` as follows:\n    ```bash\n    Usage: scrapling extract post [OPTIONS] URL OUTPUT_FILE\n    \n      Perform a POST request and save the content to a file.\n    \n      The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. 
Use file extensions (`.html`/`.md`/`.txt`) respectively.\n    \n    Options:\n      -d, --data TEXT                                Form data to include in the request body (as string, ex: \"param1=value1&param2=value2\")\n      -j, --json TEXT                                JSON data to include in the request body (as string)\n      -H, --headers TEXT                             HTTP headers in format \"Key: Value\" (can be used multiple times)\n      --cookies TEXT                                 Cookies string in format \"name1=value1;name2=value2\"\n      --timeout INTEGER                              Request timeout in seconds (default: 30)\n      --proxy TEXT                                   Proxy URL in format \"http://username:password@host:port\"\n      -s, --css-selector TEXT                        CSS selector to extract specific content from the page. It returns all matches.\n      -p, --params TEXT                              Query parameters in format \"key=value\" (can be used multiple times)\n      --follow-redirects / --no-follow-redirects     Whether to follow redirects (default: True)\n      --verify / --no-verify                         Whether to verify SSL certificates (default: True)\n      --impersonate TEXT                             Browser to impersonate (e.g., chrome, firefox).\n      --stealthy-headers / --no-stealthy-headers     Use stealthy browser headers (default: True)\n      --help                                         Show this message and exit.\n    \n    ```\n\n3. **Put Request**\n    \n    ```bash\n    scrapling extract put [URL] [OUTPUT_FILE] [OPTIONS]\n    ```\n    \n    **Examples:**\n    ```bash\n    # Send data\n    scrapling extract put \"https://scrapling.requestcatcher.com/put\" results.html --data \"update=info\" --impersonate \"firefox\"\n    \n    # Send JSON data\n    scrapling extract put \"https://scrapling.requestcatcher.com/put\" response.json --json '{\"username\": \"test\", \"action\": \"search\"}'\n    ```\n    Get the available options for the command with `scrapling extract put --help` as follows:\n    ```bash\n    Usage: scrapling extract put [OPTIONS] URL OUTPUT_FILE\n    \n      Perform a PUT request and save the content to a file.\n    \n      The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively.\n    \n    Options:\n      -d, --data TEXT                                Form data to include in the request body\n      -j, --json TEXT                                JSON data to include in the request body (as string)\n      -H, --headers TEXT                             HTTP headers in format \"Key: Value\" (can be used multiple times)\n      --cookies TEXT                                 Cookies string in format \"name1=value1;name2=value2\"\n      --timeout INTEGER                              Request timeout in seconds (default: 30)\n      --proxy TEXT                                   Proxy URL in format \"http://username:password@host:port\"\n      -s, --css-selector TEXT                        CSS selector to extract specific content from the page. 
It returns all matches.\n      -p, --params TEXT                              Query parameters in format \"key=value\" (can be used multiple times)\n      --follow-redirects / --no-follow-redirects     Whether to follow redirects (default: True)\n      --verify / --no-verify                         Whether to verify SSL certificates (default: True)\n      --impersonate TEXT                             Browser to impersonate (e.g., chrome, firefox).\n      --stealthy-headers / --no-stealthy-headers     Use stealthy browser headers (default: True)\n      --help                                         Show this message and exit.\n    ```\n\n4. **Delete Request**\n    \n    ```bash\n    scrapling extract delete [URL] [OUTPUT_FILE] [OPTIONS]\n    ```\n    \n    **Examples:**\n    ```bash\n    # Send data\n    scrapling extract delete \"https://scrapling.requestcatcher.com/delete\" results.html\n    \n    # Send JSON data\n    scrapling extract delete \"https://scrapling.requestcatcher.com/\" response.txt --impersonate \"chrome\"\n    ```\n    Get the available options for the command with `scrapling extract delete --help` as follows:\n    ```bash\n    Usage: scrapling extract delete [OPTIONS] URL OUTPUT_FILE\n    \n      Perform a DELETE request and save the content to a file.\n    \n      The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively.\n    \n    Options:\n      -H, --headers TEXT                             HTTP headers in format \"Key: Value\" (can be used multiple times)\n      --cookies TEXT                                 Cookies string in format \"name1=value1;name2=value2\"\n      --timeout INTEGER                              Request timeout in seconds (default: 30)\n      --proxy TEXT                                   Proxy URL in format \"http://username:password@host:port\"\n      -s, --css-selector TEXT                        CSS selector to extract specific content from the page. It returns all matches.\n      -p, --params TEXT                              Query parameters in format \"key=value\" (can be used multiple times)\n      --follow-redirects / --no-follow-redirects     Whether to follow redirects (default: True)\n      --verify / --no-verify                         Whether to verify SSL certificates (default: True)\n      --impersonate TEXT                             Browser to impersonate (e.g., chrome, firefox).\n      --stealthy-headers / --no-stealthy-headers     Use stealthy browser headers (default: True)\n      --help                                         Show this message and exit.\n    ```\n\n### Browsers fetching\n\n1. 
**fetch - Handle Dynamic Content**\n\n    For websites that load content dynamically or have light protection\n    \n    ```bash\n    scrapling extract fetch [URL] [OUTPUT_FILE] [OPTIONS]\n    ```\n    \n    **Examples:**\n    ```bash\n    # Wait for JavaScript to load content and finish network activity\n    scrapling extract fetch \"https://scrapling.requestcatcher.com/\" content.md --network-idle\n    \n    # Wait for specific content to appear\n    scrapling extract fetch \"https://scrapling.requestcatcher.com/\" data.txt --wait-selector \".content-loaded\"\n    \n    # Run in visible browser mode (helpful for debugging)\n    scrapling extract fetch \"https://scrapling.requestcatcher.com/\" page.html --no-headless --disable-resources\n    ```\n    Get the available options for the command with `scrapling extract fetch --help` as follows:\n    ```bash\n    Usage: scrapling extract fetch [OPTIONS] URL OUTPUT_FILE\n    \n      Use DynamicFetcher to fetch content with browser automation.\n    \n      The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively.\n    \n    Options:\n      --headless / --no-headless                  Run browser in headless mode (default: True)\n      --disable-resources / --enable-resources    Drop unnecessary resources for speed boost (default: False)\n      --network-idle / --no-network-idle          Wait for network idle (default: False)\n      --timeout INTEGER                           Timeout in milliseconds (default: 30000)\n      --wait INTEGER                              Additional wait time in milliseconds after page load (default: 0)\n      -s, --css-selector TEXT                     CSS selector to extract specific content from the page. It returns all matches.\n      --wait-selector TEXT                        CSS selector to wait for before proceeding\n      --locale TEXT                               Specify user locale. Defaults to the system default locale.\n      --real-chrome/--no-real-chrome              If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)\n      --proxy TEXT                                Proxy URL in format \"http://username:password@host:port\"\n      -H, --extra-headers TEXT                    Extra headers in format \"Key: Value\" (can be used multiple times)\n      --help                                      Show this message and exit.\n    ```\n\n2. 
**stealthy-fetch - Bypass Protection**\n\n    For websites with anti-bot protection or Cloudflare protection\n    \n    ```bash\n    scrapling extract stealthy-fetch [URL] [OUTPUT_FILE] [OPTIONS]\n    ```\n    \n    **Examples:**\n    ```bash\n    # Bypass basic protection\n    scrapling extract stealthy-fetch \"https://scrapling.requestcatcher.com\" content.md\n    \n    # Solve Cloudflare challenges\n    scrapling extract stealthy-fetch \"https://nopecha.com/demo/cloudflare\" data.txt --solve-cloudflare --css-selector \"#padded_content a\"\n    \n    # Use a proxy for anonymity.\n    scrapling extract stealthy-fetch \"https://site.com\" content.md --proxy \"http://proxy-server:8080\"\n    ```\n    Get the available options for the command with `scrapling extract stealthy-fetch --help` as follows:\n    ```bash\n    Usage: scrapling extract stealthy-fetch [OPTIONS] URL OUTPUT_FILE\n    \n      Use StealthyFetcher to fetch content with advanced stealth features.\n    \n      The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively.\n    \n    Options:\n      --headless / --no-headless                  Run browser in headless mode (default: True)\n      --disable-resources / --enable-resources    Drop unnecessary resources for speed boost (default: False)\n      --block-webrtc / --allow-webrtc             Block WebRTC entirely (default: False)\n      --solve-cloudflare / --no-solve-cloudflare  Solve Cloudflare challenges (default: False)\n      --allow-webgl / --block-webgl               Allow WebGL (default: True)\n      --network-idle / --no-network-idle          Wait for network idle (default: False)\n      --real-chrome/--no-real-chrome              If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)\n      --timeout INTEGER                           Timeout in milliseconds (default: 30000)\n      --wait INTEGER                              Additional wait time in milliseconds after page load (default: 0)\n      -s, --css-selector TEXT                     CSS selector to extract specific content from the page. 
It returns all matches.\n      --wait-selector TEXT                        CSS selector to wait for before proceeding\n      --hide-canvas / --show-canvas               Add noise to canvas operations (default: False)\n      --proxy TEXT                                Proxy URL in format \"http://username:password@host:port\"\n      -H, --extra-headers TEXT                    Extra headers in format \"Key: Value\" (can be used multiple times)\n      --help                                      Show this message and exit.\n    ```\n\n## When to use each command\n\nIf you are not a Web Scraping expert and can't decide what to choose, you can use the following formula to help you decide:\n\n- Use **`get`** with simple websites, blogs, or news articles\n- Use **`fetch`** with modern web apps, or sites with dynamic content\n- Use **`stealthy-fetch`** with protected sites, Cloudflare, or anti-bot systems\n\n## Legal and Ethical Considerations\n\n⚠️ **Important Guidelines:**\n\n- **Check robots.txt**: Visit `https://website.com/robots.txt` to see scraping rules\n- **Respect rate limits**: Don't overwhelm servers with requests\n- **Terms of Service**: Read and comply with website terms\n- **Copyright**: Respect intellectual property rights\n- **Privacy**: Be mindful of personal data protection laws\n- **Commercial use**: Ensure you have permission for business purposes\n\n---\n\n*Happy scraping! Remember to always respect website policies and comply with all applicable laws and regulations.*"
  },
  {
    "path": "docs/cli/interactive-shell.md",
    "content": "# Scrapling Interactive Shell Guide\n\n<script src=\"https://asciinema.org/a/736339.js\" id=\"asciicast-736339\" async data-autoplay=\"1\" data-loop=\"1\" data-cols=\"225\" data-rows=\"40\" data-start-at=\"00:06\" data-speed=\"1.5\" data-theme=\"tango\"></script>\n\n**Powerful Web Scraping REPL for Developers and Data Scientists**\n\nThe Scrapling Interactive Shell is an enhanced IPython-based environment designed specifically for Web Scraping tasks. It provides instant access to all Scrapling features, clever shortcuts, automatic page management, and advanced tools, such as conversion of the curl command.\n\n!!! success \"Prerequisites\"\n\n    1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.\n    2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.\n    3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.\n    4. You've completed or read at least one page from the fetchers section to use here for requests: [HTTP requests](../fetching/static.md), [Dynamic websites](../fetching/dynamic.md), or [Dynamic websites with hard protections](../fetching/stealthy.md).\n\n\n## Why use the Interactive Shell?\n\nThe interactive shell transforms web scraping from a slow script-and-run cycle into a fast, exploratory experience. It's perfect for:\n\n- **Rapid prototyping**: Test scraping strategies instantly\n- **Data exploration**: Interactively navigate and extract from websites  \n- **Learning Scrapling**: Experiment with features in real-time\n- **Debugging scrapers**: Step through requests and inspect results\n- **Converting workflows**: Transform curl commands from browser DevTools to a Fetcher request in a one-liner\n\n## Getting Started\n\n### Launch the Shell\n\n```bash\n# Start the interactive shell\nscrapling shell\n\n# Execute code and exit (useful for scripting)\nscrapling shell -c \"get('https://quotes.toscrape.com'); print(len(page.css('.quote')))\"\n\n# Set logging level\nscrapling shell --loglevel info\n```\n\nOnce launched, you'll see the Scrapling banner and can immediately start scraping as the video above shows:\n\n```python\n# No imports needed - everything is ready!\n>>> get('https://news.ycombinator.com')\n\n>>> # Explore the page structure\n>>> page.css('a')[:5]  # Look at first 5 links\n\n>>> # Refine your selectors\n>>> stories = page.css('.titleline>a')\n>>> len(stories)\n30\n\n>>> # Extract specific data\n>>> for story in stories[:3]:\n...     title = story.text\n...     url = story['href']\n...     
print(f\"{title}: {url}\")\n\n>>> # Try different approaches\n>>> titles = page.css('.titleline>a::text')  # Direct text extraction\n>>> urls = page.css('.titleline>a::attr(href)')  # Direct attribute extraction\n```\n\n## Built-in Shortcuts\n\nThe shell provides convenient shortcuts that eliminate boilerplate code:\n\n- **`get(url, **kwargs)`** - HTTP GET request (instead of `Fetcher.get`)\n- **`post(url, **kwargs)`** - HTTP POST request (instead of `Fetcher.post`)\n- **`put(url, **kwargs)`** - HTTP PUT request (instead of `Fetcher.put`)\n- **`delete(url, **kwargs)`** - HTTP DELETE request (instead of `Fetcher.delete`)\n- **`fetch(url, **kwargs)`** - Browser-based fetch (instead of `DynamicFetcher.fetch`) \n- **`stealthy_fetch(url, **kwargs)`** - Stealthy browser fetch (instead of `StealthyFetcher.fetch`)\n\nThe most commonly used classes are automatically available without any import, including `Fetcher`, `AsyncFetcher`, `DynamicFetcher`, `StealthyFetcher`, and `Selector`.\n\n### Smart Page Management\n\nThe shell automatically tracks your requests and pages:\n\n- **Current Page Access**\n\n    The `page` and `response` commands are automatically updated with the last fetched page:\n    \n    ```python\n    >>> get('https://quotes.toscrape.com')\n    >>> # 'page' and 'response' both refer to the last fetched page\n    >>> page.url\n    'https://quotes.toscrape.com'\n    >>> response.status  # Same as page.status\n    200\n    ```\n\n- **Page History**\n\n    The `pages` command keeps track of the last five pages (it's a `Selectors` object):\n    \n    ```python\n    >>> get('https://site1.com')\n    >>> get('https://site2.com') \n    >>> get('https://site3.com')\n    \n    >>> # Access last 5 pages\n    >>> len(pages)  # `Selectors` object with `page` history\n    3\n    >>> pages[0].url  # First page in history\n    'https://site1.com'\n    >>> pages[-1].url  # Most recent page\n    'https://site3.com'\n    \n    >>> # Work with historical pages\n    >>> for i, old_page in enumerate(pages):\n    ...     print(f\"Page {i}: {old_page.url} - {old_page.status}\")\n    ```\n\n## Additional helpful commands\n\n### Page Visualization\n\nView scraped pages in your browser:\n\n```python\n>>> get('https://quotes.toscrape.com')\n>>> view(page)  # Opens the page HTML in your default browser\n```\n\n### Curl Command Integration\n\nThe shell provides a few functions to help you convert curl commands from the browser DevTools to `Fetcher` requests: `uncurl` and `curl2fetcher`.\n\nFirst, you need to copy a request as a curl command like the following:\n\n<img src=\"../assets/scrapling_shell_curl.png\" title=\"Copying a request as a curl command from Chrome\" alt=\"Copying a request as a curl command from Chrome\" style=\"width: 70%;\"/>\n\n- **Convert Curl command to Request Object**\n\n    ```python\n    >>> curl_cmd = '''curl 'https://scrapling.requestcatcher.com/post' \\\n    ...   -X POST \\\n    ...   -H 'Content-Type: application/json' \\\n    ...   
-d '{\"name\": \"test\", \"value\": 123}' '''\n    \n    >>> request = uncurl(curl_cmd)\n    >>> request.method\n    'post'\n    >>> request.url\n    'https://scrapling.requestcatcher.com/post'\n    >>> request.headers\n    {'Content-Type': 'application/json'}\n    ```\n\n- **Execute Curl Command Directly**\n\n    ```python\n    >>> # Convert and execute in one step\n    >>> curl2fetcher(curl_cmd)\n    >>> page.status\n    200\n    >>> page.json()['json']\n    {'name': 'test', 'value': 123}\n    ```\n\n### IPython Features\n\nThe shell inherits all IPython capabilities:\n\n```python\n>>> # Magic commands\n>>> %time page = get('https://example.com')  # Time execution\n>>> %history  # Show command history\n>>> %save filename.py 1-10  # Save commands 1-10 to file\n\n>>> # Tab completion works everywhere\n>>> page.c<TAB>  # Shows: css, cookies, headers, etc.\n>>> Fetcher.<TAB>  # Shows all Fetcher methods\n\n>>> # Object inspection\n>>> get? # Show get documentation\n```\n\n## Examples\n\nHere are a few examples generated via AI:\n\n#### E-commerce Data Collection\n\n```python\n>>> # Start with product listing page\n>>> catalog = get('https://shop.example.com/products')\n\n>>> # Find product links\n>>> product_links = catalog.css('.product-link::attr(href)')\n>>> print(f\"Found {len(product_links)} products\")\n\n>>> # Sample a few products first\n>>> for link in product_links[:3]:\n...     product = get(f\"https://shop.example.com{link}\")\n...     name = product.css('.product-name::text').get('')\n...     price = product.css('.price::text').get('')\n...     print(f\"{name}: {price}\")\n\n>>> # Scale up with sessions for efficiency\n>>> from scrapling.fetchers import FetcherSession\n>>> with FetcherSession() as session:\n...     products = []\n...     for link in product_links:\n...         product = session.get(f\"https://shop.example.com{link}\")\n...         products.append({\n...             'name': product.css('.product-name::text').get(''),\n...             'price': product.css('.price::text').get(''),\n...             'url': link\n...         })\n```\n\n#### API Integration and Testing\n\n```python\n>>> # Test API endpoints interactively\n>>> response = get('https://jsonplaceholder.typicode.com/posts/1')\n>>> response.json()\n{'userId': 1, 'id': 1, 'title': 'sunt aut...', 'body': 'quia et...'}\n\n>>> # Test POST requests\n>>> new_post = post('https://jsonplaceholder.typicode.com/posts', \n...                 json={'title': 'Test Post', 'body': 'Test content', 'userId': 1})\n>>> new_post.json()['id']\n101\n\n>>> # Test with different data\n>>> updated = put(f'https://jsonplaceholder.typicode.com/posts/{new_post.json()[\"id\"]}',\n...               json={'title': 'Updated Title'})\n```\n\n## Getting Help\n\nIf you need help other than what is available in-terminal, you can:\n\n- [Scrapling Documentation](https://scrapling.readthedocs.io/)\n- [Discord Community](https://discord.gg/EMgGbDceNQ)\n- [GitHub Issues](https://github.com/D4Vinci/Scrapling/issues)  \n\nAnd that's it! Happy scraping! The shell makes web scraping as easy as a conversation."
  },
  {
    "path": "docs/cli/overview.md",
    "content": "# Command Line Interface\n\nSince v0.3, Scrapling includes a powerful command-line interface that provides three main capabilities:\n\n1. **Interactive Shell**: An interactive Web Scraping shell based on IPython that provides many shortcuts and useful tools\n2. **Extract Commands**: Scrape websites from the terminal without any programming\n3. **Utility Commands**: Installation and management tools\n\n```bash\n# Launch interactive shell\nscrapling shell\n\n# Convert the content of a page to markdown and save it to a file\nscrapling extract get \"https://example.com\" content.md\n\n# Get help for any command\nscrapling --help\nscrapling extract --help\n```\n\n## Requirements\nThis section requires you to install the extra `shell` dependency group, like the following:\n```bash\npip install \"scrapling[shell]\"\n```\nand the installation of the fetchers' dependencies with the following command\n```bash\nscrapling install\n```\nThis downloads all browsers, along with their system dependencies and fingerprint manipulation dependencies."
  },
  {
    "path": "docs/development/adaptive_storage_system.md",
    "content": "# Writing your retrieval system\n\nScrapling uses SQLite by default, but this tutorial shows how to write your own storage system to store element properties for the `adaptive` feature.\n\nYou might want to use Firebase, for example, and share the database between multiple spiders on different machines. It's a great idea to use an online database like that because spiders can share adaptive data with each other.\n\nSo first, to make your storage class work, it must do the big 3:\n\n1. Inherit from the abstract class `scrapling.core.storage.StorageSystemMixin` and accept a string argument, which will be the `url` argument to maintain the library logic.\n2. Use the decorator `functools.lru_cache` on top of the class to follow the Singleton design pattern as other classes.\n3. Implement methods `save` and `retrieve`, as you see from the type hints:\n    - The method `save` returns nothing and will get two arguments from the library\n        * The first one is of type `lxml.html.HtmlElement`, which is the element itself. It must be converted to a dictionary using the `element_to_dict` function in the submodule `scrapling.core.utils._StorageTools` to maintain the same format, and then saved to your database as you wish.\n        * The second one is a string, the identifier used for retrieval. The combination result of this identifier and the `url` argument from initialization must be unique for each row, or the `adaptive` data will be messed up.\n    - The method `retrieve` takes a string, which is the identifier; using it with the `url` passed on initialization, the element's dictionary is retrieved from the database and returned if it exists; otherwise, it returns `None`.\n\n> If the instructions weren't clear enough for you, you can check my implementation using SQLite3 in [storage_adaptors](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/storage.py) file\n\nIf your class meets these criteria, the rest is straightforward. If you plan to use the library in a threaded application, ensure your class supports it. The default used class is thread-safe.\n\nSome helper functions are added to the abstract class if you want to use them. 
It's easier to see it for yourself in the [code](https://github.com/D4Vinci/Scrapling/blob/main/scrapling/core/storage.py); it's heavily commented :)\n\n\n## Real-World Example: Redis Storage\n\nHere's a more practical example generated by AI using Redis:\n\n```python\nimport redis\nimport orjson\nfrom functools import lru_cache\nfrom scrapling.core.storage import StorageSystemMixin\nfrom scrapling.core.utils import _StorageTools\n\n@lru_cache(None)\nclass RedisStorage(StorageSystemMixin):\n    def __init__(self, host='localhost', port=6379, db=0, url=None):\n        super().__init__(url)\n        self.redis = redis.Redis(\n            host=host,\n            port=port,\n            db=db,\n            decode_responses=False\n        )\n        \n    def save(self, element, identifier: str) -> None:\n        # Convert element to dictionary\n        element_dict = _StorageTools.element_to_dict(element)\n        \n        # Create key\n        key = f\"scrapling:{self._get_base_url()}:{identifier}\"\n        \n        # Store as JSON\n        self.redis.set(\n            key,\n            orjson.dumps(element_dict)\n        )\n        \n    def retrieve(self, identifier: str) -> dict | None:\n        # Get data\n        key = f\"scrapling:{self._get_base_url()}:{identifier}\"\n        data = self.redis.get(key)\n        \n        # Parse JSON if exists\n        if data:\n            return orjson.loads(data)\n        return None\n```"
  },
  {
    "path": "docs/development/scrapling_custom_types.md",
    "content": "# Using Scrapling's custom types\n\n> You can take advantage of the custom-made types for Scrapling and use them outside the library if you want. It's better than copying their code, after all :)\n\n### All current types can be imported alone, like below\n```python\n>>> from scrapling.core.custom_types import TextHandler, AttributesHandler\n\n>>> somestring = TextHandler('{}')\n>>> somestring.json()\n'{}'\n>>> somedict_1 = AttributesHandler({'a': 1})\n>>> somedict_2 = AttributesHandler(a=1)\n```\n\nNote that `TextHandler` is a subclass of Python's `str`, so all standard operations/methods that work with Python strings will work.\nIf you want to check the type in your code, it's better to use Python's built-in `issubclass` function.\n\nThe class `AttributesHandler` is a subclass of `collections.abc.Mapping`, so it's immutable (read-only), and all operations are inherited from it. The data passed can be accessed later through the `_data` property, but be careful; it's of type `types.MappingProxyType`, so it's immutable (read-only) as well (faster than `collections.abc.Mapping` by fractions of seconds).\n\nSo, to make it simple for you, if you are new to Python, the same operations and methods from the Python standard `dict` type will all work with the class `AttributesHandler` except for the ones that try to modify the actual data.\n\nIf you want to modify the data inside `AttributesHandler`, you have to convert it to a dictionary first, e.g., using the `dict` function, and then change it outside."
  },
  {
    "path": "docs/donate.md",
    "content": "I've been creating all of these projects in my spare time and have invested considerable resources & effort in providing them to the community for free. By becoming a sponsor, you'd be directly funding my coffee reserves, helping me fulfill my responsibilities, and enabling me to continuously update existing projects and potentially create new ones.\n\nYou can sponsor me directly through the [GitHub Sponsors program](https://github.com/sponsors/D4Vinci) or [Buy Me a Coffee](https://buymeacoffee.com/d4vinci).\n\nThank you, stay curious, and hack the planet! ❤️\n\n## Advertisement\nIf you are looking to **advertise** your business to our target audience, check out the [available tiers](https://github.com/sponsors/D4Vinci):\n\n### 1. [The Silver tier](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=435495) ($100/month)\nPerks:\n\n1. Your logo will be featured at [the top of Scrapling's project page](https://github.com/D4Vinci/Scrapling?tab=readme-ov-file#sponsors).\n2. The same logo will be featured at [the top of Scrapling's PyPI page](https://pypi.org/project/scrapling/) and [the top of Docker's image page](https://hub.docker.com/r/pyd4vinci/scrapling), the same way it was placed on the project's page.\n\n### 2. [The Gold tier](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=591422) ($200/month)\nPerks:\n\n1. Your logo will be featured at [the top of Scrapling's project page](https://github.com/D4Vinci/Scrapling?tab=readme-ov-file#sponsors).\n2. The same logo will be featured at [the top of Scrapling's PyPI page](https://pypi.org/project/scrapling/) and [the top of Docker's image page](https://hub.docker.com/r/pyd4vinci/scrapling), the same way it was placed on the project's page.\n3. Your logo will be featured as a top sponsor on [Scrapling's website](https://scrapling.readthedocs.io/en/latest/) main page.\n\n### 3. [The Platinum tier](https://github.com/sponsors/D4Vinci/sponsorships?tier_id=586646) ($300/month)\nPerks:\n\n1. Your logo will have a special placement at [the very top of Scrapling's project page](https://github.com/D4Vinci/Scrapling?tab=readme-ov-file#platinum-sponsors) with a 30-word paragraph or less.\n2. The same logo will be featured at [the PyPI page](https://pypi.org/project/scrapling/)/[the Docker page](https://hub.docker.com/r/pyd4vinci/scrapling), the same way it was placed on the project's page.\n3. A special placement for your logo as a top sponsor on [Scrapling's website](https://scrapling.readthedocs.io/en/latest/) main page.\n4. A partner role at our Discord server and an announcement on the Twitter page and the Discord server.\n5. A Shoutout at the end of each Release notes."
  },
  {
    "path": "docs/fetching/choosing.md",
    "content": "# Fetchers basics\n\n## Introduction\nFetchers are classes that can do requests or fetch pages for you easily in a single-line fashion with many features and then return a [Response](#response-object) object. Starting with v0.3, all fetchers have separate classes to keep the session running, so for example, a fetcher that uses a browser will keep the browser open till you finish all your requests through it instead of opening multiple browsers. So it depends on your use case.\n\nThis feature was introduced because, before v0.2, Scrapling was only a parsing engine. The target here is to gradually become the one-stop shop for all Web Scraping needs.\n\n> Fetchers are not wrappers built on top of other libraries. However, they only use these libraries as an engine to request/fetch pages. To further clarify this, all fetchers have features that the underlying engines don't, while still fully leveraging those engines and optimizing them for Web Scraping.\n\n## Fetchers Overview\n\nScrapling provides three different fetcher classes with their session classes; each fetcher is designed for a specific use case.\n\nThe following table compares them and can be quickly used for guidance.\n\n\n| Feature            | Fetcher                                           | DynamicFetcher                                                                    | StealthyFetcher                                                                            |\n|--------------------|---------------------------------------------------|-----------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------|\n| Relative speed     | 🐇🐇🐇🐇🐇                                        | 🐇🐇🐇                                                                            | 🐇🐇🐇                                                                                     |\n| Stealth            | ⭐⭐                                                | ⭐⭐⭐                                                                               | ⭐⭐⭐⭐⭐                                                                                      |\n| Anti-Bot options   | ⭐⭐                                                | ⭐⭐⭐                                                                               | ⭐⭐⭐⭐⭐                                                                                      |\n| JavaScript loading | ❌                                                 | ✅                                                                                 | ✅                                                                                          |\n| Memory Usage       | ⭐                                                 | ⭐⭐⭐                                                                               | ⭐⭐⭐                                                                                        |\n| Best used for      | Basic scraping when HTTP requests alone can do it | - Dynamically loaded websites <br/>- Small automation<br/>- Small-Mid protections | - Dynamically loaded websites <br/>- Small automation <br/>- Small-Complicated protections |\n| Browser(s)         | ❌                                                 | Chromium and Google Chrome                                                        | Chromium and Google Chrome                                                                 |\n| Browser API used   | ❌                                                 
| PlayWright                                                                        | PlayWright                                                                                 |\n| Setup Complexity   | Simple                                            | Simple                                                                            | Simple                                                                                     |\n\nIn the following pages, we will talk about each one in detail.\n\n## Parser configuration in all fetchers\nAll fetchers share the same import method, as you will see in the upcoming pages\n```python\n>>> from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\n```\nThen you use it right away without initializing like this, and it will use the default parser settings:\n```python\n>>> page = StealthyFetcher.fetch('https://example.com') \n```\nIf you want to configure the parser ([Selector class](../parsing/main_classes.md#selector)) that will be used on the response before returning it for you, then do this first:\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> Fetcher.configure(adaptive=True, keep_comments=False, keep_cdata=False)  # and the rest\n```\nor\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> Fetcher.adaptive=True\n>>> Fetcher.keep_comments=False\n>>> Fetcher.keep_cdata=False  # and the rest\n```\nThen, continue your code as usual.\n\nThe available configuration arguments are: `adaptive`, `adaptive_domain`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the [Selector](../parsing/main_classes.md#selector) class. You can display the current configuration anytime by running `<fetcher_class>.display_config()`.\n\n!!! info\n\n    The `adaptive` argument is disabled by default; you must enable it to use that feature.\n\n### Set parser config per request\nAs you probably understand, the logic above for setting the parser config will apply globally to all requests/fetches made through that class, and it's intended for simplicity.\n\nIf your use case requires a different configuration for each request/fetch, you can pass a dictionary to the request method (`fetch`/`get`/`post`/...) to an argument named `selector_config`.\n\n## Response Object\nThe `Response` object is the same as the [Selector](../parsing/main_classes.md#selector) class, but it has additional details about the response, like response headers, status, cookies, etc., as shown below:\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> page = Fetcher.get('https://example.com')\n\n>>> page.status          # HTTP status code\n>>> page.reason          # Status message\n>>> page.cookies         # Response cookies as a dictionary\n>>> page.headers         # Response headers\n>>> page.request_headers # Request headers\n>>> page.history         # Response history of redirections, if any\n>>> page.body            # Raw response body as bytes\n>>> page.encoding        # Response encoding\n>>> page.meta            # Response metadata dictionary (e.g., proxy used). Mainly helpful with the spiders system.\n```\nAll fetchers return the `Response` object.\n\n!!! note\n\n    Unlike the [Selector](../parsing/main_classes.md#selector) class, the `Response` class's body is always bytes since v0.4."
  },
  {
    "path": "docs/fetching/dynamic.md",
    "content": "# Fetching dynamic websites\n\nHere, we will discuss the `DynamicFetcher` class (formerly `PlayWrightFetcher`). This class provides flexible browser automation with multiple configuration options and little under-the-hood stealth improvements.\n\nAs we will explain later, to automate the page, you need some knowledge of [Playwright's Page API](https://playwright.dev/python/docs/api/class-page).\n\n!!! success \"Prerequisites\"\n\n    1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.\n    2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.\n    3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.\n\n## Basic Usage\nYou have one primary way to import this Fetcher, which is the same for all fetchers.\n\n```python\n>>> from scrapling.fetchers import DynamicFetcher\n```\nCheck out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)\n\nNow, we will review most of the arguments one by one, using examples. If you want to jump to a table of all arguments for quick reference, [click here](#full-list-of-arguments)\n\n!!! abstract\n\n    The async version of the `fetch` method is `async_fetch`, of course.\n\n\nThis fetcher currently provides three main run options that can be combined as desired.\n\nWhich are:\n\n### 1. Vanilla Playwright\n```python\nDynamicFetcher.fetch('https://example.com')\n```\nUsing it in that manner will open a Chromium browser and load the page. There are optimizations for speed, and some stealth goes automatically under the hood, but other than that, there are no tricks or extra features unless you enable some; it's just a plain PlayWright API.\n\n### 2. Real Chrome\n```python\nDynamicFetcher.fetch('https://example.com', real_chrome=True)\n```\nIf you have a Google Chrome browser installed, use this option. It's the same as the first option, but it will use the Google Chrome browser you installed on your device instead of Chromium. This will make your requests look more authentic, so they're less detectable for better results.\n\nIf you don't have Google Chrome installed and want to use this option, you can use the command below in the terminal to install it for the library instead of installing it manually:\n```commandline\nplaywright install chrome\n```\n\n### 3. CDP Connection\n```python\nDynamicFetcher.fetch('https://example.com', cdp_url='ws://localhost:9222')\n```\nInstead of launching a browser locally (Chromium/Google Chrome), you can connect to a remote browser through the [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/).\n\n\n!!! 
note \"Notes:\"\n\n    * There was a `stealth` option here, but it was moved to the `StealthyFetcher` class, as explained on the next page, with additional features since version 0.3.13.<br/>\n    * This makes it less confusing for new users, easier to maintain, and provides other benefits, as explained on the [StealthyFetcher page](../fetching/stealthy.md).\n\n## Full list of arguments\nScrapling provides many options with this fetcher and its session classes. To make it as simple as possible, we will list the options here and give examples of how to use most of them.\n\n|      Argument       | Description                                                                                                                                                                                                                         | Optional |\n|:-------------------:|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:|\n|         url         | Target url                                                                                                                                                                                                                          |    ❌     |\n|      headless       | Pass `True` to run the browser in headless/hidden (**default**) or `False` for headful/visible mode.                                                                                                                                |    ✔️    |\n|  disable_resources  | Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.                         |    ✔️    |\n|       cookies       | Set cookies for the next request.                                                                                                                                                                                                   |    ✔️    |\n|      useragent      | Pass a useragent string to be used. **Otherwise, the fetcher will generate and use a real Useragent of the same browser and version.**                                                                                              |    ✔️    |\n|    network_idle     | Wait for the page until there are no network connections for at least 500 ms.                                                                                                                                                       |    ✔️    |\n|      load_dom       | Enabled by default, wait for all JavaScript on page(s) to fully load and execute (wait for the `domcontentloaded` state).                                                                                                           |    ✔️    |\n|       timeout       | The timeout (milliseconds) used in all operations and waits through the page. The default is 30,000 ms (30 seconds).                                                                                                                |    ✔️    |\n|        wait         | The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.                                                                                                |    ✔️    |\n|     page_action     | Added for automation. 
Pass a function that takes the `page` object and does the necessary automation.                                                                                                                               |    ✔️    |\n|    wait_selector    | Wait for a specific css selector to be in a specific state.                                                                                                                                                                         |    ✔️    |\n|     init_script     | An absolute path to a JavaScript file to be executed on page creation for all pages in this session.                                                                                                                                |    ✔️    |\n| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._                                                                                                 |    ✔️    |\n|    google_search    | Enabled by default, Scrapling will set a Google referer header.                                                                                                                                                                      |    ✔️    |\n|    extra_headers    | A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._                                                                                |    ✔️    |\n|        proxy        | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'.                                                                                                     |    ✔️    |\n|     real_chrome     | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser.                                                                                                |    ✔️    |\n|       locale        | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. |    ✔️    |\n|     timezone_id     | Changes the timezone of the browser. Defaults to the system timezone.                                                                                                                                                               |    ✔️    |\n|       cdp_url       | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.                                                                                                                          |    ✔️    |\n|    user_data_dir    | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions**                                                       |    ✔️    |\n|     extra_flags     | A list of additional browser flags to pass to the browser on launch.                                                                                                                                                                
|    ✔️    |\n|   additional_args   | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings.                                                                                          |    ✔️    |\n|   selector_config   | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class.                                                                                                                            |    ✔️    |\n|   blocked_domains   | A set of domain names to block requests to. Subdomains are also matched (e.g., `\"example.com\"` blocks `\"sub.example.com\"` too).                                                                                                     |    ✔️    |\n|    proxy_rotator    | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`.                                                                                                                                            |    ✔️    |\n|       retries       | Number of retry attempts for failed requests. Defaults to 3.                                                                                                                                                                        |    ✔️    |\n|     retry_delay     | Seconds to wait between retry attempts. Defaults to 1.                                                                                                                                                                              |    ✔️    |\n\nIn session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `blocked_domains`, `proxy`, and `selector_config`.\n\n!!! note \"Notes:\"\n\n    1. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.\n    2. The `google_search` argument is enabled by default for all requests, setting the referer to `https://www.google.com/`. If used together with `extra_headers`, it takes priority over the referer set there.\n    3. Since version 0.3.13, the `stealth` option has been removed here in favor of the `StealthyFetcher` class, and the `hide_canvas` option has been moved to it. The `disable_webgl` argument has been moved to the `StealthyFetcher` class and renamed as `allow_webgl`.\n    4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. 
If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.\n\n\n## Examples\nIt's easier to understand with examples, so let's take a look.\n\n### Resource Control\n\n```python\n# Disable unnecessary resources\npage = DynamicFetcher.fetch('https://example.com', disable_resources=True)  # Blocks fonts, images, media, etc.\n```\n\n### Domain Blocking\n\n```python\n# Block requests to specific domains (and their subdomains)\npage = DynamicFetcher.fetch('https://example.com', blocked_domains={\"ads.example.com\", \"tracker.net\"})\n```\n\n### Network Control\n\n```python\n# Wait for network idle (Consider fetch to be finished when there are no network connections for at least 500 ms)\npage = DynamicFetcher.fetch('https://example.com', network_idle=True)\n\n# Custom timeout (in milliseconds)\npage = DynamicFetcher.fetch('https://example.com', timeout=30000)  # 30 seconds\n\n# Proxy support (It can also be a dictionary with only the keys 'server', 'username', and 'password'.)\npage = DynamicFetcher.fetch('https://example.com', proxy='http://username:password@host:port')\n```\n\n### Proxy Rotation\n\n```python\nfrom scrapling.fetchers import DynamicSession, ProxyRotator\n\n# Set up proxy rotation\nrotator = ProxyRotator([\n    \"http://proxy1:8080\",\n    \"http://proxy2:8080\",\n    \"http://proxy3:8080\",\n])\n\n# Use with session - rotates proxy automatically with each request\nwith DynamicSession(proxy_rotator=rotator, headless=True) as session:\n    page1 = session.fetch('https://example1.com')\n    page2 = session.fetch('https://example2.com')\n\n    # Override rotator for a specific request\n    page3 = session.fetch('https://example3.com', proxy='http://specific-proxy:8080')\n```\n\n!!! warning\n\n    Remember that by default, all browser-based fetchers and sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. Once the tab's job is done, both the tab and its context are closed.\n\n### Downloading Files\n\n```python\npage = DynamicFetcher.fetch('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png')\n\nwith open(file='main_cover.png', mode='wb') as f:\n    f.write(page.body)\n```\n\nThe `body` attribute of the `Response` object always returns `bytes`.\n\n### Browser Automation\nThis is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.\n\nThis function is executed immediately after waiting for `network_idle` (if enabled) and before waiting for the `wait_selector` argument, allowing it to be used for purposes beyond automation. 
You can alter the page as you want.\n\nIn the example below, I used the pages' [mouse events](https://playwright.dev/python/docs/api/class-mouse) to scroll the page with the mouse wheel, then move the mouse.\n```python\nfrom playwright.sync_api import Page\n\ndef scroll_page(page: Page):\n    page.mouse.wheel(10, 0)\n    page.mouse.move(100, 400)\n    page.mouse.up()\n\npage = DynamicFetcher.fetch('https://example.com', page_action=scroll_page)\n```\nOf course, if you use the async fetch version, the function must also be async.\n```python\nfrom playwright.async_api import Page\n\nasync def scroll_page(page: Page):\n   await page.mouse.wheel(10, 0)\n   await page.mouse.move(100, 400)\n   await page.mouse.up()\n\npage = await DynamicFetcher.async_fetch('https://example.com', page_action=scroll_page)\n```\n\n### Wait Conditions\n\n```python\n# Wait for the selector\npage = DynamicFetcher.fetch(\n    'https://example.com',\n    wait_selector='h1',\n    wait_selector_state='visible'\n)\n```\nThis is the last wait the fetcher will do before returning the response (if enabled). You pass a CSS selector to the `wait_selector` argument, and the fetcher will wait for the state you passed in the `wait_selector_state` argument to be fulfilled. If you didn't pass a state, the default would be `attached`, which means it will wait for the element to be present in the DOM.\n\nAfter that, if `load_dom` is enabled (the default), the fetcher will check again to see if all JavaScript files are loaded and executed (in the `domcontentloaded` state) or continue waiting. If you have enabled `network_idle`, the fetcher will wait for `network_idle` to be fulfilled again, as explained above.\n\nThe states the fetcher can wait for can be any of the following ([source](https://playwright.dev/python/docs/api/class-page#page-wait-for-selector)):\n\n- `attached`: Wait for an element to be present in the DOM.\n- `detached`: Wait for an element to not be present in the DOM.\n- `visible`: wait for an element to have a non-empty bounding box and no `visibility:hidden`. Note that an element without any content or with `display:none` has an empty bounding box and is not considered visible.\n- `hidden`: wait for an element to be either detached from the DOM, or have an empty bounding box, or `visibility:hidden`. This is opposite to the `'visible'` option.\n\n### Some Stealth Features\n\n```python\npage = DynamicFetcher.fetch(\n    'https://example.com',\n    google_search=True,\n    useragent='Mozilla/5.0...',  # Custom user agent\n    locale='en-US',  # Set browser locale\n)\n```\n\n### General example\n```python\nfrom scrapling.fetchers import DynamicFetcher\n\ndef scrape_dynamic_content():\n    # Use Playwright for JavaScript content\n    page = DynamicFetcher.fetch(\n        'https://example.com/dynamic',\n        network_idle=True,\n        wait_selector='.content'\n    )\n    \n    # Extract dynamic content\n    content = page.css('.content')\n    \n    return {\n        'title': content.css('h1::text').get(),\n        'items': [\n            item.text for item in content.css('.item')\n        ]\n    }\n```\n\n## Session Management\n\nTo keep the browser open until you make multiple requests with the same configuration, use `DynamicSession`/`AsyncDynamicSession` classes. 
Those classes can accept all the arguments that the `fetch` function can take, which enables you to specify a config for the entire session.\n\n```python\nfrom scrapling.fetchers import DynamicSession\n\n# Create a session with default configuration\nwith DynamicSession(\n    headless=True,\n    disable_resources=True,\n    real_chrome=True\n) as session:\n    # Make multiple requests with the same browser instance\n    page1 = session.fetch('https://example1.com')\n    page2 = session.fetch('https://example2.com')\n    page3 = session.fetch('https://dynamic-site.com')\n    \n    # All requests reuse the same tab on the same browser instance\n```\n\n### Async Session Usage\n\n```python\nimport asyncio\nfrom scrapling.fetchers import AsyncDynamicSession\n\nasync def scrape_multiple_sites():\n    async with AsyncDynamicSession(\n        network_idle=True,\n        timeout=30000,\n        max_pages=3\n    ) as session:\n        # Make async requests with shared browser configuration\n        pages = await asyncio.gather(\n            session.fetch('https://spa-app1.com'),\n            session.fetch('https://spa-app2.com'),\n            session.fetch('https://dynamic-content.com')\n        )\n        return pages\n```\n\nYou may have noticed the `max_pages` argument. This is a new argument that enables the fetcher to create a **rotating pool of Browser tabs**. Instead of using a single tab for all your requests, you set a limit on the maximum number of pages that can be displayed at once. With each request, the library will close all tabs that have finished their task and check if the number of the current tabs is lower than the maximum allowed number of pages/tabs, then:\n\n1. If you are within the allowed range, the fetcher will create a new tab for you, and then all is as normal.\n2. Otherwise, it will keep checking every subsecond if creating a new tab is allowed or not for 60 seconds, then raise `TimeoutError`. This can happen when the website you are fetching becomes unresponsive.\n\nThis logic allows for multiple URLs to be fetched at the same time in the same browser, which saves a lot of resources, but most importantly, is so fast :)\n\nIn versions 0.3 and 0.3.1, the pool was reusing finished tabs to save more resources/time. That logic proved flawed, as it's nearly impossible to protect pages/tabs from contamination by the previous configuration used in the request before this one.\n\n### Session Benefits\n\n- **Browser reuse**: Much faster subsequent requests by reusing the same browser instance.\n- **Cookie persistence**: Automatic cookie and session state handling as any browser does automatically.\n- **Consistent fingerprint**: Same browser fingerprint across all requests.\n- **Memory efficiency**: Better resource usage compared to launching new browsers with each fetch.\n\n## When to Use\n\nUse DynamicFetcher when:\n\n- Need browser automation\n- Want multiple browser options\n- Using a real Chrome browser\n- Need custom browser config\n- Want a few stealth options \n\nIf you want more stealth and control without much config, check out the [StealthyFetcher](stealthy.md)."
  },
  {
    "path": "docs/fetching/static.md",
    "content": "# HTTP requests\n\nThe `Fetcher` class provides rapid and lightweight HTTP requests using the high-performance `curl_cffi` library with a lot of stealth capabilities.\n\n!!! success \"Prerequisites\"\n\n    1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.\n    2. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.\n    3. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.\n\n## Basic Usage\nYou have one primary way to import this Fetcher, which is the same for all fetchers.\n\n```python\n>>> from scrapling.fetchers import Fetcher\n```\nCheck out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)\n\n### Shared arguments\nAll methods for making requests here share some arguments, so let's discuss them first.\n\n- **url**: The targeted URL\n- **stealthy_headers**: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.\n- **follow_redirects**: As the name implies, tell the fetcher to follow redirections. **Enabled by default**\n- **timeout**: The number of seconds to wait for each request to be finished. **Defaults to 30 seconds**.\n- **retries**: The number of retries that the fetcher will do for failed requests. **Defaults to three retries**.\n- **retry_delay**: Number of seconds to wait between retry attempts. **Defaults to 1 second**.\n- **impersonate**: Impersonate specific browsers' TLS fingerprints. Accepts browser strings or a list of them like `\"chrome110\"`, `\"firefox102\"`, `\"safari15_5\"` to use specific versions or `\"chrome\"`, `\"firefox\"`, `\"safari\"`, `\"edge\"` to automatically use the latest version available. This makes your requests appear to come from real browsers at the TLS level. If you pass it a list of strings, it will choose a random one with each request. **Defaults to the latest available Chrome version.**\n- **http3**: Use HTTP/3 protocol for requests. **Defaults to False**. It might be problematic if used with `impersonate`.\n- **cookies**: Cookies to use in the request. Can be a dictionary of `name→value` or a list of dictionaries.\n- **proxy**: As the name implies, the proxy for this request is used to route all traffic (HTTP and HTTPS). The format accepted here is `http://username:password@localhost:8030`.\n- **proxy_auth**: HTTP basic auth for proxy, tuple of (username, password).\n- **proxies**: Dict of proxies to use. Format: `{\"http\": proxy_url, \"https\": proxy_url}`.\n- **proxy_rotator**: A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy` or `proxies`.\n- **headers**: Headers to include in the request. Can override any header generated by the `stealthy_headers` argument\n- **max_redirects**: Maximum number of redirects. **Defaults to 30**, use -1 for unlimited.\n- **verify**: Whether to verify HTTPS certificates. 
**Defaults to True**.\n- **cert**: Tuple of (cert, key) filenames for the client certificate.\n- **selector_config**: A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class.\n\n!!! note \"Notes:\"\n\n    1. The currently available browsers to impersonate are (`\"edge\"`, `\"chrome\"`, `\"chrome_android\"`, `\"safari\"`, `\"safari_beta\"`, `\"safari_ios\"`, `\"safari_ios_beta\"`, `\"firefox\"`, `\"tor\"`)<br/>\n    2. The available browsers to impersonate, along with their corresponding versions, are automatically displayed in the argument autocompletion and updated with each `curl_cffi` update.<br/>\n    3. If any of the arguments `impersonate` or `stealthy_headers` are enabled, the fetchers will automatically generate real browser headers that match the browser version used.\n\nOther than this, for further customization, you can pass any arguments that `curl_cffi` supports for any method if that method doesn't already support them.\n\n### HTTP Methods\nThere are additional arguments for each method, depending on the method, such as `params` for GET requests and `data`/`json` for POST/PUT/DELETE requests.\n\nExamples are the best way to explain this:\n\n> Hence: `OPTIONS` and `HEAD` methods are not supported.\n#### GET\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> # Basic GET\n>>> page = Fetcher.get('https://example.com')\n>>> page = Fetcher.get('https://scrapling.requestcatcher.com/get', stealthy_headers=True, follow_redirects=True)\n>>> page = Fetcher.get('https://scrapling.requestcatcher.com/get', proxy='http://username:password@localhost:8030')\n>>> # With parameters\n>>> page = Fetcher.get('https://example.com/search', params={'q': 'query'})\n>>>\n>>> # With headers\n>>> page = Fetcher.get('https://example.com', headers={'User-Agent': 'Custom/1.0'})\n>>> # Basic HTTP authentication\n>>> page = Fetcher.get(\"https://example.com\", auth=(\"my_user\", \"password123\"))\n>>> # Browser impersonation\n>>> page = Fetcher.get('https://example.com', impersonate='chrome')\n>>> # HTTP/3 support\n>>> page = Fetcher.get('https://example.com', http3=True)\n```\nAnd for asynchronous requests, it's a small adjustment \n```python\n>>> from scrapling.fetchers import AsyncFetcher\n>>> # Basic GET\n>>> page = await AsyncFetcher.get('https://example.com')\n>>> page = await AsyncFetcher.get('https://scrapling.requestcatcher.com/get', stealthy_headers=True, follow_redirects=True)\n>>> page = await AsyncFetcher.get('https://scrapling.requestcatcher.com/get', proxy='http://username:password@localhost:8030')\n>>> # With parameters\n>>> page = await AsyncFetcher.get('https://example.com/search', params={'q': 'query'})\n>>>\n>>> # With headers\n>>> page = await AsyncFetcher.get('https://example.com', headers={'User-Agent': 'Custom/1.0'})\n>>> # Basic HTTP authentication\n>>> page = await AsyncFetcher.get(\"https://example.com\", auth=(\"my_user\", \"password123\"))\n>>> # Browser impersonation\n>>> page = await AsyncFetcher.get('https://example.com', impersonate='chrome110')\n>>> # HTTP/3 support\n>>> page = await AsyncFetcher.get('https://example.com', http3=True)\n```\nNeedless to say, the `page` object in all cases is [Response](choosing.md#response-object) object, which is a [Selector](../parsing/main_classes.md#selector) as we said, so you can use it directly\n```python\n>>> page.css('.something.something')\n\n>>> page = Fetcher.get('https://api.github.com/events')\n>>> page.json()\n[{'id': '<redacted>',\n  'type': 'PushEvent',\n  'actor': 
{'id': '<redacted>',\n   'login': '<redacted>',\n   'display_login': '<redacted>',\n   'gravatar_id': '',\n   'url': 'https://api.github.com/users/<redacted>',\n   'avatar_url': 'https://avatars.githubusercontent.com/u/<redacted>'},\n  'repo': {'id': '<redacted>',\n...\n```\n#### POST\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> # Basic POST\n>>> page = Fetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, params={'q': 'query'})\n>>> page = Fetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, stealthy_headers=True, follow_redirects=True)\n>>> page = Fetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030', impersonate=\"chrome\")\n>>> # Another example of form-encoded data\n>>> page = Fetcher.post('https://example.com/submit', data={'username': 'user', 'password': 'pass'}, http3=True)\n>>> # JSON data\n>>> page = Fetcher.post('https://example.com/api', json={'key': 'value'})\n```\nAnd for asynchronous requests, it's a small adjustment\n```python\n>>> from scrapling.fetchers import AsyncFetcher\n>>> # Basic POST\n>>> page = await AsyncFetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'})\n>>> page = await AsyncFetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, stealthy_headers=True, follow_redirects=True)\n>>> page = await AsyncFetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030', impersonate=\"chrome\")\n>>> # Another example of form-encoded data\n>>> page = await AsyncFetcher.post('https://example.com/submit', data={'username': 'user', 'password': 'pass'}, http3=True)\n>>> # JSON data\n>>> page = await AsyncFetcher.post('https://example.com/api', json={'key': 'value'})\n```\n#### PUT\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> # Basic PUT\n>>> page = Fetcher.put('https://example.com/update', data={'status': 'updated'})\n>>> page = Fetcher.put('https://example.com/update', data={'status': 'updated'}, stealthy_headers=True, follow_redirects=True, impersonate=\"chrome\")\n>>> page = Fetcher.put('https://example.com/update', data={'status': 'updated'}, proxy='http://username:password@localhost:8030')\n>>> # Another example of form-encoded data\n>>> page = Fetcher.put(\"https://scrapling.requestcatcher.com/put\", data={'key': ['value1', 'value2']})\n```\nAnd for asynchronous requests, it's a small adjustment\n```python\n>>> from scrapling.fetchers import AsyncFetcher\n>>> # Basic PUT\n>>> page = await AsyncFetcher.put('https://example.com/update', data={'status': 'updated'})\n>>> page = await AsyncFetcher.put('https://example.com/update', data={'status': 'updated'}, stealthy_headers=True, follow_redirects=True, impersonate=\"chrome\")\n>>> page = await AsyncFetcher.put('https://example.com/update', data={'status': 'updated'}, proxy='http://username:password@localhost:8030')\n>>> # Another example of form-encoded data\n>>> page = await AsyncFetcher.put(\"https://scrapling.requestcatcher.com/put\", data={'key': ['value1', 'value2']})\n```\n\n#### DELETE\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> page = Fetcher.delete('https://example.com/resource/123')\n>>> page = Fetcher.delete('https://example.com/resource/123', stealthy_headers=True, follow_redirects=True, impersonate=\"chrome\")\n>>> page = Fetcher.delete('https://example.com/resource/123', 
proxy='http://username:password@localhost:8030')\n```\nAnd for asynchronous requests, it's a small adjustment\n```python\n>>> from scrapling.fetchers import AsyncFetcher\n>>> page = await AsyncFetcher.delete('https://example.com/resource/123')\n>>> page = await AsyncFetcher.delete('https://example.com/resource/123', stealthy_headers=True, follow_redirects=True, impersonate=\"chrome\")\n>>> page = await AsyncFetcher.delete('https://example.com/resource/123', proxy='http://username:password@localhost:8030')\n```\n\n## Session Management\n\nFor making multiple requests with the same configuration, use the `FetcherSession` class. It can be used in both synchronous and asynchronous code without issue; the class automatically detects and changes the session type, without requiring a different import.\n\nThe `FetcherSession` class can accept nearly all the arguments that the methods can take, which enables you to specify a config for the entire session and later choose a different config for one of the requests effortlessly, as you will see in the following examples.\n\n```python\nfrom scrapling.fetchers import FetcherSession\n\n# Create a session with default configuration\nwith FetcherSession(\n    impersonate='chrome',\n    http3=True,\n    stealthy_headers=True,\n    timeout=30,\n    retries=3\n) as session:\n    # Make multiple requests with the same settings and the same cookies\n    page1 = session.get('https://scrapling.requestcatcher.com/get')\n    page2 = session.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'})\n    page3 = session.get('https://api.github.com/events')\n\n    # All requests share the same session and connection pool\n```\n\nYou can also use a `ProxyRotator` with `FetcherSession` for automatic proxy rotation across requests:\n\n```python\nfrom scrapling.fetchers import FetcherSession, ProxyRotator\n\nrotator = ProxyRotator([\n    'http://proxy1:8080',\n    'http://proxy2:8080',\n    'http://proxy3:8080',\n])\n\nwith FetcherSession(proxy_rotator=rotator, impersonate='chrome') as session:\n    # Each request automatically uses the next proxy in rotation\n    page1 = session.get('https://example.com/page1')\n    page2 = session.get('https://example.com/page2')\n\n    # You can check which proxy was used via the response metadata\n    print(page1.meta['proxy'])\n```\n\nYou can also override the session proxy (or rotator) for a specific request by passing `proxy=` directly to the request method:\n\n```python\nwith FetcherSession(proxy='http://default-proxy:8080') as session:\n    # Uses the session proxy\n    page1 = session.get('https://example.com/page1')\n\n    # Override the proxy for this specific request\n    page2 = session.get('https://example.com/page2', proxy='http://special-proxy:9090')\n```\n\nAnd here's an async example\n\n```python\nasync with FetcherSession(impersonate='firefox', http3=True) as session:\n    # All standard HTTP methods available\n    response = await session.get('https://example.com')\n    response = await session.post('https://scrapling.requestcatcher.com/post', json={'data': 'value'})\n    response = await session.put('https://scrapling.requestcatcher.com/put', data={'update': 'info'})\n    response = await session.delete('https://scrapling.requestcatcher.com/delete')\n```\nor better\n```python\nimport asyncio\nfrom scrapling.fetchers import FetcherSession\n\n# Async session usage\nasync with FetcherSession(impersonate=\"safari\") as session:\n    urls = ['https://example.com/page1', 'https://example.com/page2']\n\n    
tasks = [\n        session.get(url) for url in urls\n    ]\n\n    pages = await asyncio.gather(*tasks)\n```\n\nThe `Fetcher` class uses `FetcherSession` to create a temporary session with each request you make.\n\n### Session Benefits\n\n- **A lot faster**: 10 times faster than creating a new session for each request\n- **Cookie persistence**: Automatic cookie handling across requests\n- **Resource efficiency**: Better memory and CPU usage for multiple requests\n- **Centralized configuration**: Single place to manage request settings\n\n## Examples\nSome well-rounded examples to aid newcomers to Web Scraping.\n\n### Basic HTTP Request\n\n```python\nfrom scrapling.fetchers import Fetcher\n\n# Make a request\npage = Fetcher.get('https://example.com')\n\n# Check the status\nif page.status == 200:\n    # Extract title\n    title = page.css('title::text').get()\n    print(f\"Page title: {title}\")\n\n    # Extract all links\n    links = page.css('a::attr(href)').getall()\n    print(f\"Found {len(links)} links\")\n```\n\n### Product Scraping\n\n```python\nfrom scrapling.fetchers import Fetcher\n\ndef scrape_products():\n    page = Fetcher.get('https://example.com/products')\n    \n    # Find all product elements\n    products = page.css('.product')\n    \n    results = []\n    for product in products:\n        results.append({\n            'title': product.css('.title::text').get(),\n            'price': product.css('.price::text').re_first(r'\\d+\\.\\d{2}'),\n            'description': product.css('.description::text').get(),\n            'in_stock': product.has_class('in-stock')\n        })\n    \n    return results\n```\n\n### Downloading Files\n\n```python\nfrom scrapling.fetchers import Fetcher\n\npage = Fetcher.get('https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/main_cover.png')\nwith open(file='main_cover.png', mode='wb') as f:\n    f.write(page.body)\n```\n\n### Pagination Handling\n\n```python\nfrom scrapling.fetchers import Fetcher\n\ndef scrape_all_pages():\n    base_url = 'https://example.com/products?page={}'\n    page_num = 1\n    all_products = []\n    \n    while True:\n        # Get current page\n        page = Fetcher.get(base_url.format(page_num))\n        \n        # Find products\n        products = page.css('.product')\n        if not products:\n            break\n            \n        # Process products\n        for product in products:\n            all_products.append({\n                'name': product.css('.name::text').get(),\n                'price': product.css('.price::text').get()\n            })\n            \n        # Next page\n        page_num += 1\n        \n    return all_products\n```\n\n### Form Submission\n\n```python\nfrom scrapling.fetchers import Fetcher\n\n# Submit login form\nresponse = Fetcher.post(\n    'https://example.com/login',\n    data={\n        'username': 'user@example.com',\n        'password': 'password123'\n    }\n)\n\n# Check login success\nif response.status == 200:\n    # Extract user info\n    user_name = response.css('.user-name::text').get()\n    print(f\"Logged in as: {user_name}\")\n```\n\n### Table Extraction\n\n```python\nfrom scrapling.fetchers import Fetcher\n\ndef extract_table():\n    page = Fetcher.get('https://example.com/data')\n    \n    # Find table\n    table = page.css('table')[0]\n    \n    # Extract headers\n    headers = [\n        th.text for th in table.css('thead th')\n    ]\n    \n    # Extract rows\n    rows = []\n    for row in table.css('tbody tr'):\n        cells = [td.text for td in 
row.css('td')]\n        rows.append(dict(zip(headers, cells)))\n        \n    return rows\n```\n\n### Navigation Menu\n\n```python\nfrom scrapling.fetchers import Fetcher\n\ndef extract_menu():\n    page = Fetcher.get('https://example.com')\n    \n    # Find navigation\n    nav = page.css('nav')[0]\n    \n    menu = {}\n    for item in nav.css('li'):\n        links = item.css('a')\n        if links:\n            link = links[0]\n            menu[link.text] = {\n                'url': link['href'],\n                'has_submenu': bool(item.css('.submenu'))\n            }\n            \n    return menu\n```\n\n## When to Use\n\nUse `Fetcher` when:\n\n- Need rapid HTTP requests.\n- Want minimal overhead.\n- Don't need JavaScript execution (the website can be scraped through requests).\n- Need some stealth features (e.g., the targeted website uses protection but doesn't use JavaScript challenges).\n\nUse `FetcherSession` when:\n\n- Making multiple requests to the same or different sites.\n- Need to maintain cookies/authentication between requests.\n- Want connection pooling for better performance.\n- Require consistent configuration across requests.\n- Working with APIs that require a session state.\n\nUse other fetchers when:\n\n- Need browser automation.\n- Need advanced anti-bot/stealth capabilities.\n- Need JavaScript support or to interact with dynamic content."
  },
  {
    "path": "docs/fetching/stealthy.md",
    "content": "# Fetching dynamic websites with hard protections\n\nHere, we will discuss the `StealthyFetcher` class. This class is very similar to the [DynamicFetcher](dynamic.md#introduction) class, including the browsers, the automation, and the use of [Playwright's API](https://playwright.dev/python/docs/intro). The main difference is that this class provides advanced anti-bot protection bypass capabilities; most of them are handled automatically under the hood, and the rest is up to you to enable.\n\nAs with [DynamicFetcher](dynamic.md#introduction), you will need some knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) to automate the page, as we will explain later.\n\n!!! success \"Prerequisites\"\n\n    1. You've completed or read the [DynamicFetcher](dynamic.md#introduction) page since this class builds upon it, and we won't repeat the same information here for that reason.\n    2. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand what the [Response object](../fetching/choosing.md#response-object) is and which fetcher to use.\n    3. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object) object.\n    4. You've completed or read the [Main classes](../parsing/main_classes.md) page to know what properties/methods the [Response](../fetching/choosing.md#response-object) class is inheriting from the [Selector](../parsing/main_classes.md#selector) class.\n\n## Basic Usage\nYou have one primary way to import this Fetcher, which is the same for all fetchers.\n\n```python\n>>> from scrapling.fetchers import StealthyFetcher\n```\nCheck out how to configure the parsing options [here](choosing.md#parser-configuration-in-all-fetchers)\n\n!!! abstract\n\n    The async version of the `fetch` method is `async_fetch`, of course.\n\n## What does it do?\n\nThe `StealthyFetcher` class is a stealthy version of the [DynamicFetcher](dynamic.md#introduction) class, and here are some of the things it does:\n\n1. It easily bypasses all types of Cloudflare's Turnstile/Interstitial automatically. \n2. It bypasses CDP runtime leaks and WebRTC leaks.\n3. It isolates JS execution, removes many Playwright fingerprints, and stops detection through some of the known behaviors that bots do.\n4. It generates canvas noise to prevent fingerprinting through canvas.\n5. It automatically patches known methods to detect running in headless mode and provides an option to defeat timezone mismatch attacks.\n6. and other anti-protection options...\n\n## Full list of arguments\nScrapling provides many options with this fetcher and its session classes. 
Before jumping to the [examples](#examples), here's the full list of arguments\n\n\n|      Argument       | Description                                                                                                                                                                                                                         | Optional |\n|:-------------------:|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------:|\n|         url         | Target url                                                                                                                                                                                                                          |    ❌     |\n|      headless       | Pass `True` to run the browser in headless/hidden (**default**) or `False` for headful/visible mode.                                                                                                                                |    ✔️    |\n|  disable_resources  | Drop requests for unnecessary resources for a speed boost. Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.                         |    ✔️    |\n|       cookies       | Set cookies for the next request.                                                                                                                                                                                                   |    ✔️    |\n|      useragent      | Pass a useragent string to be used. **Otherwise, the fetcher will generate and use a real Useragent of the same browser and version.**                                                                                              |    ✔️    |\n|    network_idle     | Wait for the page until there are no network connections for at least 500 ms.                                                                                                                                                       |    ✔️    |\n|      load_dom       | Enabled by default, wait for all JavaScript on page(s) to fully load and execute (wait for the `domcontentloaded` state).                                                                                                           |    ✔️    |\n|       timeout       | The timeout (milliseconds) used in all operations and waits through the page. The default is 30,000 ms (30 seconds).                                                                                                                |    ✔️    |\n|        wait         | The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.                                                                                                |    ✔️    |\n|     page_action     | Added for automation. Pass a function that takes the `page` object and does the necessary automation.                                                                                                                               |    ✔️    |\n|    wait_selector    | Wait for a specific css selector to be in a specific state.                                                                                                                                                                         
|    ✔️    |\n|     init_script     | An absolute path to a JavaScript file to be executed on page creation for all pages in this session.                                                                                                                                |    ✔️    |\n| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._                                                                                                 |    ✔️    |\n|    google_search    | Enabled by default, Scrapling will set a Google referer header.                                                                                                                                                                      |    ✔️    |\n|    extra_headers    | A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._                                                                                |    ✔️    |\n|        proxy        | The proxy to be used with requests. It can be a string or a dictionary with only the keys 'server', 'username', and 'password'.                                                                                                     |    ✔️    |\n|     real_chrome     | If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch and use an instance of your browser.                                                                                                |    ✔️    |\n|       locale        | Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect `navigator.language` value, `Accept-Language` request header value, as well as number and date formatting rules. Defaults to the system default locale. |    ✔️    |\n|     timezone_id     | Changes the timezone of the browser. Defaults to the system timezone.                                                                                                                                                               |    ✔️    |\n|       cdp_url       | Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.                                                                                                                          |    ✔️    |\n|    user_data_dir    | Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory. **Only Works with sessions**                                                       |    ✔️    |\n|     extra_flags     | A list of additional browser flags to pass to the browser on launch.                                                                                                                                                                |    ✔️    |\n|  solve_cloudflare   | When enabled, fetcher solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.                                                                                                      |    ✔️    |\n|    block_webrtc     | Forces WebRTC to respect proxy settings to prevent local IP address leak.                                                                                                                                                           
|    ✔️    |\n|     hide_canvas     | Add random noise to canvas operations to prevent fingerprinting.                                                                                                                                                                    |    ✔️    |\n|     allow_webgl     | Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended, as many WAFs now check if WebGL is enabled.                                                                     |    ✔️    |\n|   additional_args   | Additional arguments to be passed to Playwright's context as additional settings, and they take higher priority than Scrapling's settings.                                                                                          |    ✔️    |\n|   selector_config   | A dictionary of custom parsing arguments to be used when creating the final `Selector`/`Response` class.                                                                                                                            |    ✔️    |\n|   blocked_domains   | A set of domain names to block requests to. Subdomains are also matched (e.g., `\"example.com\"` blocks `\"sub.example.com\"` too).                                                                                                     |    ✔️    |\n|    proxy_rotator    | A `ProxyRotator` instance for automatic proxy rotation. Cannot be combined with `proxy`.                                                                                                                                            |    ✔️    |\n|       retries       | Number of retry attempts for failed requests. Defaults to 3.                                                                                                                                                                        |    ✔️    |\n|     retry_delay     | Seconds to wait between retry attempts. Defaults to 1.                                                                                                                                                                              |    ✔️    |\n\nIn session classes, all these arguments can be set globally for the session. Still, you can configure each request individually by passing some of the arguments here that can be configured on the browser tab level like: `google_search`, `timeout`, `wait`, `page_action`, `extra_headers`, `disable_resources`, `wait_selector`, `wait_selector_state`, `network_idle`, `load_dom`, `solve_cloudflare`, `blocked_domains`, `proxy`, and `selector_config`.\n\n!!! note \"Notes:\"\n\n    1. It's basically the same arguments as [DynamicFetcher](dynamic.md#introduction) class, but with these additional arguments: `solve_cloudflare`, `block_webrtc`, `hide_canvas`, and `allow_webgl`.\n    2. The `disable_resources` option made requests ~25% faster in my tests for some websites and can help save your proxy usage, but be careful with it, as it can cause some websites to never finish loading.\n    3. The `google_search` argument is enabled by default for all requests, setting the referer to `https://www.google.com/`. If used together with `extra_headers`, it takes priority over the referer set there.\n    4. If you didn't set a user agent and enabled headless mode, the fetcher will generate a real user agent for the same browser version and use it. 
If you didn't set a user agent and didn't enable headless mode, the fetcher will use the browser's default user agent, which is the same as in standard browsers in the latest versions.\n\n## Examples\nIt's easier to understand with examples, so we will now review most of the arguments individually. Since it's the same class as the [DynamicFetcher](dynamic.md#introduction), you can refer to that page for more examples, as we won't repeat all the examples from there.\n\n### Cloudflare and stealth options\n\n```python\n# Automatic Cloudflare solver\npage = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare', solve_cloudflare=True)\n\n# Works with other stealth options\npage = StealthyFetcher.fetch(\n    'https://protected-site.com',\n    solve_cloudflare=True,\n    block_webrtc=True,\n    real_chrome=True,\n    hide_canvas=True,\n    google_search=True,\n    proxy='http://username:password@host:port',  # It can also be a dictionary with only the keys 'server', 'username', and 'password'.\n)\n```\n\nThe `solve_cloudflare` parameter enables automatic detection and solving of all types of Cloudflare's Turnstile/Interstitial challenges:\n\n- JavaScript challenges (managed)\n- Interactive challenges (clicking verification boxes)\n- Invisible challenges (automatic background verification)\n\nIt even solves custom pages with an embedded captcha.\n\n!!! note \"**Important notes:**\"\n\n    1. Sometimes, with websites that use custom implementations, you will need to use `wait_selector` to make sure Scrapling waits for the real website content to be loaded after solving the captcha. Some websites are the very definition of an edge case, even though we try to make the solver as generic as possible.\n    2. The timeout should be at least 60 seconds when using the Cloudflare solver for sufficient challenge-solving time.\n    3. This feature works seamlessly with proxies and other stealth options.\n\n### Browser Automation\nThis is where your knowledge about [Playwright's Page API](https://playwright.dev/python/docs/api/class-page) comes into play. The function you pass here takes the page object from Playwright's API, performs the desired action, and then the fetcher continues.\n\nThis function is executed immediately after waiting for `network_idle` (if enabled) and before waiting for the `wait_selector` argument, allowing it to be used for purposes beyond automation. You can alter the page as you want.\n\nIn the example below, I used the page's [mouse events](https://playwright.dev/python/docs/api/class-mouse) to scroll the page with the mouse wheel, then move the mouse.\n```python\nfrom playwright.sync_api import Page\n\ndef scroll_page(page: Page):\n    page.mouse.wheel(10, 0)\n    page.mouse.move(100, 400)\n    page.mouse.up()\n\npage = StealthyFetcher.fetch('https://example.com', page_action=scroll_page)\n```\nOf course, if you use the async fetch version, the function must also be async.\n```python\nfrom playwright.async_api import Page\n\nasync def scroll_page(page: Page):\n    await page.mouse.wheel(10, 0)\n    await page.mouse.move(100, 400)\n    await page.mouse.up()\n\npage = await StealthyFetcher.async_fetch('https://example.com', page_action=scroll_page)\n```\n\n### Wait Conditions\n```python\n# Wait for the selector\npage = StealthyFetcher.fetch(\n    'https://example.com',\n    wait_selector='h1',\n    wait_selector_state='visible'\n)\n```\nThis is the last wait the fetcher will do before returning the response (if enabled). 
You pass a CSS selector to the `wait_selector` argument, and the fetcher will wait for the state you passed in the `wait_selector_state` argument to be fulfilled. If you didn't pass a state, the default would be `attached`, which means it will wait for the element to be present in the DOM.\n\nAfter that, if `load_dom` is enabled (the default), the fetcher will check again to see if all JavaScript files are loaded and executed (in the `domcontentloaded` state) or continue waiting. If you have enabled `network_idle`, the fetcher will wait for `network_idle` to be fulfilled again, as explained above.\n\nThe states the fetcher can wait for are the following ([source](https://playwright.dev/python/docs/api/class-page#page-wait-for-selector)):\n\n- `attached`: Wait for an element to be present in the DOM.\n- `detached`: Wait for an element to not be present in the DOM.\n- `visible`: Wait for an element to have a non-empty bounding box and no `visibility:hidden`. Note that an element without any content or with `display:none` has an empty bounding box and is not considered visible.\n- `hidden`: Wait for an element to be either detached from the DOM, or have an empty bounding box, or `visibility:hidden`. This is opposite to the `'visible'` option.\n\n\n### Real-world example (Amazon)\nThis is for educational purposes only; this example was generated by AI, which also shows how easy it is to work with Scrapling through AI.\n```python\ndef scrape_amazon_product(url):\n    # Use StealthyFetcher to bypass protection\n    page = StealthyFetcher.fetch(url)\n\n    # Extract product details\n    return {\n        'title': page.css('#productTitle::text').get().clean(),\n        'price': page.css('.a-price .a-offscreen::text').get(),\n        'rating': page.css('[data-feature-name=\"averageCustomerReviews\"] .a-popover-trigger .a-color-base::text').get(),\n        'reviews_count': page.css('#acrCustomerReviewText::text').re_first(r'[\\d,]+'),\n        'features': [\n            li.get().clean() for li in page.css('#feature-bullets li span::text')\n        ],\n        'availability': page.css('#availability')[0].get_all_text(strip=True),\n        'images': [\n            img.attrib['src'] for img in page.css('#altImages img')\n        ]\n    }\n```\n\n## Session Management\n\nTo keep the browser open while you make multiple requests with the same configuration, use the `StealthySession`/`AsyncStealthySession` classes. 
Those classes can accept all the arguments that the `fetch` function can take, which enables you to specify a config for the entire session.\n\n```python\nfrom scrapling.fetchers import StealthySession\n\n# Create a session with default configuration\nwith StealthySession(\n    headless=True,\n    real_chrome=True,\n    block_webrtc=True,\n    solve_cloudflare=True\n) as session:\n    # Make multiple requests with the same browser instance\n    page1 = session.fetch('https://example1.com')\n    page2 = session.fetch('https://example2.com') \n    page3 = session.fetch('https://nopecha.com/demo/cloudflare')\n    \n    # All requests reuse the same tab on the same browser instance\n```\n\n### Async Session Usage\n\n```python\nimport asyncio\nfrom scrapling.fetchers import AsyncStealthySession\n\nasync def scrape_multiple_sites():\n    async with AsyncStealthySession(\n        real_chrome=True,\n        block_webrtc=True,\n        solve_cloudflare=True,\n        timeout=60000,  # 60 seconds for Cloudflare challenges\n        max_pages=3\n    ) as session:\n        # Make async requests with shared browser configuration\n        pages = await asyncio.gather(\n            session.fetch('https://site1.com'),\n            session.fetch('https://site2.com'), \n            session.fetch('https://protected-site.com')\n        )\n        return pages\n```\n\nYou may have noticed the `max_pages` argument. This argument enables the fetcher to create a **rotating pool of browser tabs**. Instead of using a single tab for all your requests, you set a limit on the maximum number of pages/tabs that can be open at once. With each request, the library will close all tabs that have finished their task and check whether the number of currently open tabs is lower than the maximum allowed number of pages/tabs, then:\n\n1. If you are within the allowed range, the fetcher will create a new tab for your request, and everything proceeds as normal.\n2. Otherwise, it will keep checking at sub-second intervals for up to 60 seconds whether a new tab can be created, then raise a `TimeoutError`. This can happen when the website you are fetching becomes unresponsive.\n\nThis logic allows multiple URLs to be fetched at the same time in the same browser, which saves a lot of resources and, most importantly, is very fast :)\n\nIn versions 0.3 and 0.3.1, the pool was reusing finished tabs to save more resources/time. That logic proved flawed, as it's nearly impossible to protect pages/tabs from contamination by the configuration used in the previous request.\n\n### Session Benefits\n\n- **Browser reuse**: Much faster subsequent requests by reusing the same browser instance.\n- **Cookie persistence**: Automatic cookie and session state handling, just as any browser does.\n- **Consistent fingerprint**: Same browser fingerprint across all requests.\n- **Memory efficiency**: Better resource usage compared to launching new browsers with each fetch.\n\n## Using Camoufox as an engine\n\nThis fetcher used a custom version of [Camoufox](https://github.com/daijro/camoufox) as its engine before version 0.3.13; it was replaced by [patchright](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright) for many reasons. 
If you see that Camoufox is stable on your device, has no high memory issues, and you want to continue using it, then you can.\n\nFirst, you will need to install the Camoufox library, browser, and Firefox system dependencies if you didn't already:\n```commandline\npip install camoufox\nplaywright install-deps firefox\ncamoufox fetch\n```\nThen you will inherit from `StealthySession` and set it as below:\n```python\nfrom scrapling.fetchers import StealthySession\nfrom playwright.sync_api import sync_playwright\nfrom camoufox.utils import launch_options as generate_launch_options\n\nclass StealthySession(StealthySession):\n    def start(self):\n        \"\"\"Create a browser for this instance and context.\"\"\"\n        if not self.playwright:\n            self.playwright = sync_playwright().start()\n            # Configure camoufox run options here\n            launch_options = generate_launch_options(**{\"headless\": True, \"user_data_dir\": ''})\n            # Here's an example, part of what we have been doing before v0.3.13\n            launch_options = generate_launch_options(**{\n                \"geoip\": False,\n                \"proxy\": self._config.proxy,\n                \"headless\": self._config.headless,\n                \"humanize\": True if self._config.solve_cloudflare else False,  # Better enable humanize for Cloudflare, otherwise it's up to you\n                \"i_know_what_im_doing\": True,  # To turn warnings off with the user configurations\n                \"allow_webgl\": self._config.allow_webgl,\n                \"block_webrtc\": self._config.block_webrtc,\n                \"os\": None,\n                \"user_data_dir\": self._config.user_data_dir,\n                \"firefox_user_prefs\": {\n                    # This is what enabling `enable_cache` does internally, so we do it from here instead\n                    \"browser.sessionhistory.max_entries\": 10,\n                    \"browser.sessionhistory.max_total_viewers\": -1,\n                    \"browser.cache.memory.enable\": True,\n                    \"browser.cache.disk_cache_ssl\": True,\n                    \"browser.cache.disk.smart_size.enabled\": True,\n                },\n                # etc...\n            })\n            self.context = self.playwright.firefox.launch_persistent_context(**launch_options)\n        else:\n            raise RuntimeError(\"Session has been already started\")\n```\nAfter that, you can use it normally as before, even for solving Cloudflare challenges:\n```python\nwith StealthySession(solve_cloudflare=True, headless=True) as session:\n    page = session.fetch('https://sergiodemo.com/security/challenge/legacy-challenge')\n    if page.css('#page-not-found-404'):\n        print('Cloudflare challenge solved successfully!')\n```\n\nThe same logic applies to the `AsyncStealthySession` class with a few differences:\n```python\nfrom scrapling.fetchers import AsyncStealthySession\nfrom playwright.async_api import async_playwright\nfrom camoufox.utils import launch_options as generate_launch_options\n\nclass AsyncStealthySession(AsyncStealthySession):\n    async def start(self):\n        \"\"\"Create a browser for this instance and context.\"\"\"\n        if not self.playwright:\n            self.playwright = await async_playwright().start()\n            # Configure camoufox run options here\n            launch_options = generate_launch_options(**{\"headless\": True, \"user_data_dir\": ''})\n            # or set the launch options as in the above example\n            self.context 
= await self.playwright.firefox.launch_persistent_context(**launch_options)\n        else:\n            raise RuntimeError(\"Session has been already started\")\n \nasync with AsyncStealthySession(solve_cloudflare=True, headless=True) as session:\n    page = await session.fetch('https://sergiodemo.com/security/challenge/legacy-challenge')\n    if page.css('#page-not-found-404'):\n        print('Cloudflare challenge solved successfully!')\n```\n\nEnjoy! :)\n\n## When to Use\n\nUse `StealthyFetcher` when:\n\n- Need to bypass anti-bot protection.\n- Need a reliable browser fingerprint.\n- Need full JavaScript support.\n- Want automatic stealth features.\n- Need browser automation.\n- Are dealing with Cloudflare protection."
  },
  {
    "path": "docs/index.md",
    "content": "<style>\n.md-typeset h1 {\n  display: none;\n}\n[data-md-color-scheme=\"default\"] .only-dark { display: none; }\n[data-md-color-scheme=\"slate\"] .only-light { display: none; }\n</style>\n\n<br/>\n<div align=\"center\">\n    <a href=\"https://scrapling.readthedocs.io/en/latest/\" alt=\"poster\">\n        <img alt=\"Scrapling\" src=\"assets/cover_light.svg\" class=\"only-light\">\n        <img alt=\"Scrapling\" src=\"assets/cover_dark.svg\" class=\"only-dark\">\n    </a>\n</div>\n\n<h2 align=\"center\"><i>Effortless Web Scraping for the Modern Web</i></h2><br>\n\nScrapling is an adaptive Web Scraping framework that handles everything from a single request to a full-scale crawl.\n\nIts parser learns from website changes and automatically relocates your elements when pages update. Its fetchers bypass anti-bot systems like Cloudflare Turnstile out of the box. And its spider framework lets you scale up to concurrent, multi-session crawls with pause/resume and automatic proxy rotation — all in a few lines of Python. One library, zero compromises.\n\nBlazing fast crawls with real-time stats and streaming. Built by Web Scrapers for Web Scrapers and regular users, there's something for everyone.\n\n```python\nfrom scrapling.fetchers import Fetcher, StealthyFetcher, DynamicFetcher\nStealthyFetcher.adaptive = True\npage = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)  # Fetch website under the radar!\nproducts = page.css('.product', auto_save=True)                                        # Scrape data that survives website design changes!\nproducts = page.css('.product', adaptive=True)                                         # Later, if the website structure changes, pass `adaptive=True` to find them!\n```\nOr scale up to full crawls\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass MySpider(Spider):\n  name = \"demo\"\n  start_urls = [\"https://example.com/\"]\n\n  async def parse(self, response: Response):\n      for item in response.css('.product'):\n          yield {\"title\": item.css('h2::text').get()}\n\nMySpider().start()\n```\n\n## Top Sponsors \n\n<style>\n.ad {\n    width:240px;\n    height:100px;\n}\n\n</style>\n\n<!-- sponsors -->\n<div style=\"text-align: center;\">\n  <a href=\"https://hypersolutions.co/?utm_source=github&utm_medium=readme&utm_campaign=scrapling\" target=\"_blank\" title=\"Bot Protection Bypass API for Akamai, DataDome, Incapsula & Kasada\">\n    <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/HyperSolutions.png\" class=\"ad\">\n  </a>\n  <a href=\"https://birdproxies.com/t/scrapling\" target=\"_blank\" title=\"At Bird Proxies, we eliminate your pains such as banned IPs, geo restriction, and high costs so you can focus on your work.\">\n    <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/BirdProxies.jpg\" class=\"ad\">\n  </a>\n  <a href=\"https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling\" target=\"_blank\" title=\"Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB\">\n    <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png\" class=\"ad\">\n  </a>\n  <a href=\"https://tikhub.io/?ref=KarimShoair\" target=\"_blank\" title=\"Unlock the Power of Social Media Data & AI\">\n    <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TikHub.jpg\" class=\"ad\">\n  </a>\n  <a href=\"https://www.nsocks.com/?keyword=2p67aivg\" target=\"_blank\" title=\"Scalable 
Web Data Access for AI Applications\">\n    <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/nsocks.png\" class=\"ad\">\n  </a>\n  <a href=\"https://petrosky.io/d4vinci\" target=\"_blank\" title=\"PetroSky delivers cutting-edge VPS hosting.\">\n    <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png\" class=\"ad\">\n  </a>\n  <a href=\"https://substack.thewebscraping.club/p/scrapling-hands-on-guide?utm_source=github&utm_medium=repo&utm_campaign=scrapling\" target=\"_blank\" title=\"The #1 newsletter dedicated to Web Scraping\">\n    <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/TWSC.png\" class=\"ad\">\n  </a>\n  <a href=\"https://proxy-seller.com/?partner=CU9CAA5TBYFFT2\" target=\"_blank\" title=\"Proxy-Seller provides reliable proxy infrastructure for Web Scraping\">\n    <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/ProxySeller.png\" class=\"ad\">\n  </a>\n  <br />\n  <br />\n</div>\n<!-- /sponsors -->\n\n<i><sub>Do you want to show your ad here? Click [here](https://github.com/sponsors/D4Vinci), choose a plan, and enjoy the rest of the perks!</sub></i>\n\n## Key Features\n\n### Spiders — A Full Crawling Framework\n- 🕷️ **Scrapy-like Spider API**: Define spiders with `start_urls`, async `parse` callbacks, and `Request`/`Response` objects.\n- ⚡ **Concurrent Crawling**: Configurable concurrency limits, per-domain throttling, and download delays.\n- 🔄 **Multi-Session Support**: Unified interface for HTTP requests, and stealthy headless browsers in a single spider — route requests to different sessions by ID.\n- 💾 **Pause & Resume**: Checkpoint-based crawl persistence. Press Ctrl+C for a graceful shutdown; restart to resume from where you left off.\n- 📡 **Streaming Mode**: Stream scraped items as they arrive via `async for item in spider.stream()` with real-time stats — ideal for UI, pipelines, and long-running crawls.\n- 🛡️ **Blocked Request Detection**: Automatic detection and retry of blocked requests with customizable logic.\n- 📦 **Built-in Export**: Export results through hooks and your own pipeline or the built-in JSON/JSONL with `result.items.to_json()` / `result.items.to_jsonl()` respectively.\n\n### Advanced Websites Fetching with Session Support\n- **HTTP Requests**: Fast and stealthy HTTP requests with the `Fetcher` class. Can impersonate browsers' TLS fingerprint, headers, and use HTTP/3.\n- **Dynamic Loading**: Fetch dynamic websites with full browser automation through the `DynamicFetcher` class supporting Playwright's Chromium and Google's Chrome.\n- **Anti-bot Bypass**: Advanced stealth capabilities with `StealthyFetcher` and fingerprint spoofing. 
Can easily bypass all types of Cloudflare's Turnstile/Interstitial with automation.\n- **Session Management**: Persistent session support with `FetcherSession`, `StealthySession`, and `DynamicSession` classes for cookie and state management across requests.\n- **Proxy Rotation**: Built-in `ProxyRotator` with cyclic or custom rotation strategies across all session types, plus per-request proxy overrides.\n- **Domain Blocking**: Block requests to specific domains (and their subdomains) in browser-based fetchers.\n- **Async Support**: Complete async support across all fetchers and dedicated async session classes.\n\n### Adaptive Scraping & AI Integration\n- 🔄 **Smart Element Tracking**: Relocate elements after website changes using intelligent similarity algorithms.\n- 🎯 **Smart Flexible Selection**: CSS selectors, XPath selectors, filter-based search, text search, regex search, and more.\n- 🔍 **Find Similar Elements**: Automatically locate elements similar to found elements.\n- 🤖 **MCP Server to be used with AI**: Built-in MCP server for AI-assisted Web Scraping and data extraction. The MCP server features powerful, custom capabilities that leverage Scrapling to extract targeted content before passing it to the AI (Claude/Cursor/etc), thereby speeding up operations and reducing costs by minimizing token usage. ([demo video](https://www.youtube.com/watch?v=qyFk3ZNwOxE))\n\n### High-Performance & battle-tested Architecture\n- 🚀 **Lightning Fast**: Optimized performance outperforming most Python scraping libraries.\n- 🔋 **Memory Efficient**: Optimized data structures and lazy loading for a minimal memory footprint.\n- ⚡ **Fast JSON Serialization**: 10x faster than the standard library.\n- 🏗️ **Battle tested**: Not only does Scrapling have 92% test coverage and full type hints coverage, but it has been used daily by hundreds of Web Scrapers over the past year.\n\n### Developer/Web Scraper Friendly Experience\n- 🎯 **Interactive Web Scraping Shell**: Optional built-in IPython shell with Scrapling integration, shortcuts, and new tools to speed up Web Scraping scripts development, like converting curl requests to Scrapling requests and viewing requests results in your browser.\n- 🚀 **Use it directly from the Terminal**: Optionally, you can use Scrapling to scrape a URL without writing a single line of code!\n- 🛠️ **Rich Navigation API**: Advanced DOM traversal with parent, sibling, and child navigation methods.\n- 🧬 **Enhanced Text Processing**: Built-in regex, cleaning methods, and optimized string operations.\n- 📝 **Auto Selector Generation**: Generate robust CSS/XPath selectors for any element.\n- 🔌 **Familiar API**: Similar to Scrapy/BeautifulSoup with the same pseudo-elements used in Scrapy/Parsel.\n- 📘 **Complete Type Coverage**: Full type hints for excellent IDE support and code completion. 
The entire codebase is automatically scanned with **PyRight** and **MyPy** with each change.\n- 🔋 **Ready Docker image**: With each release, a Docker image containing all browsers is automatically built and pushed.\n\n\n## Star History\nScrapling’s GitHub stars have grown steadily since its release (see chart below).\n\n<div id=\"chartContainer\">\n  <a href=\"https://github.com/D4Vinci/Scrapling\">\n    <img id=\"chartImage\" alt=\"Star History Chart\" loading=\"lazy\" src=\"https://api.star-history.com/svg?repos=D4Vinci/Scrapling&type=Date\" height=\"400\"/>\n  </a>\n</div>\n\n<script>\nconst observer = new MutationObserver((mutations) => {\n  mutations.forEach((mutation) => {\n    if (mutation.attributeName === 'data-md-color-media') {\n      const colorMedia = document.body.getAttribute('data-md-color-media');\n      const isDarkScheme = document.body.getAttribute('data-md-color-scheme') === 'slate';\n      const chartImg = document.querySelector('#chartImage');\n      const baseUrl = 'https://api.star-history.com/svg?repos=D4Vinci/Scrapling&type=Date';\n      \n      if (colorMedia === '(prefers-color-scheme)' ? isDarkScheme : colorMedia.includes('dark')) {\n        chartImg.src = `${baseUrl}&theme=dark`;\n      } else {\n        chartImg.src = baseUrl;\n      }\n    }\n  });\n});\n\nobserver.observe(document.body, {\n  attributes: true,\n  attributeFilter: ['data-md-color-media', 'data-md-color-scheme']\n});\n</script>\n\n\n## Installation\nScrapling requires Python 3.10 or higher:\n\n```bash\npip install scrapling\n```\n\nThis installation only includes the parser engine and its dependencies, without any fetchers or commandline dependencies.\n\n### Optional Dependencies\n\n1. If you are going to use any of the extra features below, the fetchers, or their classes, you will need to install fetchers' dependencies and their browser dependencies as follows:\n    ```bash\n    pip install \"scrapling[fetchers]\"\n    \n    scrapling install           # normal install\n    scrapling install  --force  # force reinstall\n    ```\n\n    This downloads all browsers, along with their system dependencies and fingerprint manipulation dependencies.\n\n    Or you can install them from the code instead of running a command like this:\n    ```python\n    from scrapling.cli import install\n    \n    install([], standalone_mode=False)          # normal install\n    install([\"--force\"], standalone_mode=False) # force reinstall\n    ```\n\n2. 
Extra features:\n\n\n     - Install the MCP server feature:\n       ```bash\n       pip install \"scrapling[ai]\"\n       ```\n     - Install shell features (Web Scraping shell and the `extract` command): \n         ```bash\n         pip install \"scrapling[shell]\"\n         ```\n     - Install everything: \n         ```bash\n         pip install \"scrapling[all]\"\n         ```\n     Don't forget that you need to install the browser dependencies with `scrapling install` after any of these extras (if you didn't already)\n\n### Docker\nYou can also install a Docker image with all extras and browsers with the following command from DockerHub:\n```bash\ndocker pull pyd4vinci/scrapling\n```\nOr download it from the GitHub registry:\n```bash\ndocker pull ghcr.io/d4vinci/scrapling:latest\n```\nThis image is automatically built and pushed using GitHub Actions and the repository's main branch.\n\n## How the documentation is organized\nScrapling has extensive documentation, so we try to follow the [Diátaxis documentation framework](https://diataxis.fr/).\n\n## Support\n\nIf you like Scrapling and want to support its development:\n\n- ⭐ Star the [GitHub repository](https://github.com/D4Vinci/Scrapling)\n- 🚀 Follow us on [Twitter](https://x.com/Scrapling_dev) and join the [discord server](https://discord.gg/EMgGbDceNQ)\n- 💝 Consider [sponsoring the project or buying me a coffee](donate.md) :wink:\n- 🐛 Report bugs and suggest features through [GitHub Issues](https://github.com/D4Vinci/Scrapling/issues)\n\n## License\n\nThis project is licensed under the BSD-3 License. See the [LICENSE](https://github.com/D4Vinci/Scrapling/blob/main/LICENSE) file for details."
  },
  {
    "path": "docs/overrides/main.html",
    "content": "{% extends \"base.html\" %}\n\n{% block announce %}\n  <a href=\"https://dataimpulse.com/?utm_source=scrapling&utm_medium=banner&utm_campaign=scrapling\" target=\"_blank\" style=\"display:flex; justify-content:center; padding:0px 0;\">\n    <img src=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/DataImpulse.png\" alt=\"At DataImpulse, we specialize in developing custom proxy services for your business. Make requests from anywhere, collect data, and enjoy fast connections with our premium proxies.\" style=\"max-height:60px;\">\n  </a>\n{% endblock %}\n\n{% block extrahead %}\n    <!-- Open Graph -->\n    <meta property=\"og:image\" content=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/main_cover.png\" />\n    <meta property=\"og:image:type\" content=\"image/png\" />\n    <meta property=\"og:image:width\" content=\"1344\" />\n    <meta property=\"og:image:height\" content=\"768\" />\n    <meta property=\"og:type\" content=\"website\" />\n    <meta property=\"og:site_name\" content=\"Scrapling documentation\" />\n\n    <!-- Twitter -->\n    <meta name=\"twitter:card\" content=\"summary_large_image\" />\n    <meta name=\"twitter:image\" content=\"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/main_cover.png\" />\n    <meta name=\"twitter:site\" content=\"@Scrapling_dev\" />\n    <meta name=\"twitter:creator\" content=\"@D4Vinci1\" />\n\n    <!-- General -->\n    <meta name=\"author\" content=\"Karim Shoair\" />\n    <meta name=\"theme-color\" content=\"#673ab7\" />\n{% endblock %}"
  },
  {
    "path": "docs/overview.md",
    "content": "## Pick Your Path\n\nNot sure where to start? Pick the path that matches what you're trying to do:\n\n| I want to... | Start here |\n|:---|:---|\n| **Parse HTML** I already have | [Querying elements](parsing/selection.md) — CSS, XPath, and text-based selection |\n| **Quickly scrape a page** and prototype | Pick a [fetcher](fetching/choosing.md) and test right away, or launch the [interactive shell](cli/interactive-shell.md) |\n| **Build a crawler** that scales | [Spiders](spiders/getting-started.md) — concurrent, multi-session crawls with pause/resume |\n| **Scrape without writing code** | [CLI extract commands](cli/extract-commands.md) or hook up the [MCP server](ai/mcp-server.md) to your favourite AI tool |\n| **Migrate** from another library | [From BeautifulSoup](tutorials/migrating_from_beautifulsoup.md) or [Scrapy comparison](spiders/architecture.md#comparison-with-scrapy) |\n\n---\n\nWe will start by quickly reviewing the parsing capabilities. Then we will fetch websites using custom browsers, make requests, and parse the responses.\n\nHere's an HTML document generated by ChatGPT that we will be using as an example throughout this page:\n```html\n<html>\n  <head>\n    <title>Complex Web Page</title>\n    <style>\n      .hidden { display: none; }\n    </style>\n  </head>\n  <body>\n    <header>\n      <nav>\n        <ul>\n          <li> <a href=\"#home\">Home</a> </li>\n          <li> <a href=\"#about\">About</a> </li>\n          <li> <a href=\"#contact\">Contact</a> </li>\n        </ul>\n      </nav>\n    </header>\n    <main>\n      <section id=\"products\" schema='{\"jsonable\": \"data\"}'>\n        <h2>Products</h2>\n        <div class=\"product-list\">\n          <article class=\"product\" data-id=\"1\">\n            <h3>Product 1</h3>\n            <p class=\"description\">This is product 1</p>\n            <span class=\"price\">$10.99</span>\n            <div class=\"hidden stock\">In stock: 5</div>\n          </article>\n\n          <article class=\"product\" data-id=\"2\">\n            <h3>Product 2</h3>\n            <p class=\"description\">This is product 2</p>\n            <span class=\"price\">$20.99</span>\n            <div class=\"hidden stock\">In stock: 3</div>\n          </article>\n\n          <article class=\"product\" data-id=\"3\">\n            <h3>Product 3</h3>\n            <p class=\"description\">This is product 3</p>\n            <span class=\"price\">$15.99</span>\n            <div class=\"hidden stock\">Out of stock</div>\n          </article>\n        </div>\n      </section>\n      \n      <section id=\"reviews\">\n        <h2>Customer Reviews</h2>\n        <div class=\"review-list\">\n          <div class=\"review\" data-rating=\"5\">\n            <p class=\"review-text\">Great product!</p>\n            <span class=\"reviewer\">John Doe</span>\n          </div>\n          <div class=\"review\" data-rating=\"4\">\n            <p class=\"review-text\">Good value for money.</p>\n            <span class=\"reviewer\">Jane Smith</span>\n          </div>\n        </div>\n      </section>\n    </main>\n    <script id=\"page-data\" type=\"application/json\">\n      {\n        \"lastUpdated\": \"2024-09-22T10:30:00Z\",\n        \"totalProducts\": 3\n      }\n    </script>\n  </body>\n</html>\n```\nStarting with loading raw HTML above like this\n```python\nfrom scrapling.parser import Selector\npage = Selector(html_doc)\npage  # <data='<html><head><title>Complex Web Page</tit...'>\n```\nGet all text content on the page 
recursively\n```python\npage.get_all_text(ignore_tags=('script', 'style'))\n# 'Complex Web Page\\nHome\\nAbout\\nContact\\nProducts\\nProduct 1\\nThis is product 1\\n$10.99\\nIn stock: 5\\nProduct 2\\nThis is product 2\\n$20.99\\nIn stock: 3\\nProduct 3\\nThis is product 3\\n$15.99\\nOut of stock\\nCustomer Reviews\\nGreat product!\\nJohn Doe\\nGood value for money.\\nJane Smith'\n```\n\n## Finding elements\nIf there's an element you want to find on the page, you will find it! Your creativity level is the only limitation!\n\nFinding the first HTML `section` element\n```python\nsection_element = page.find('section')\n# <data='<section id=\"products\" schema='{\"jsonabl...' parent='<main><section id=\"products\" schema='{\"j...'>\n```\nFind all `section` elements\n```python\nsection_elements = page.find_all('section')\n# [<data='<section id=\"products\" schema='{\"jsonabl...' parent='<main><section id=\"products\" schema='{\"j...'>, <data='<section id=\"reviews\"><h2>Customer Revie...' parent='<main><section id=\"products\" schema='{\"j...'>]\n```\nFind all `section` elements whose `id` attribute value is `products`.\n```python\nsection_elements = page.find_all('section', {'id':\"products\"})\n# Same as\nsection_elements = page.find_all('section', id=\"products\")\n# [<data='<section id=\"products\" schema='{\"jsonabl...' parent='<main><section id=\"products\" schema='{\"j...'>]\n```\nFind all `section` elements whose `id` attribute value contains `product`.\n```python\nsection_elements = page.find_all('section', {'id*':\"product\"})\n```\nFind all `h3` elements whose text content matches this regex `Product \\d`\n```python\npage.find_all('h3', re.compile(r'Product \\d'))\n# [<data='<h3>Product 1</h3>' parent='<article class=\"product\" data-id=\"1\"><h3...'>, <data='<h3>Product 2</h3>' parent='<article class=\"product\" data-id=\"2\"><h3...'>, <data='<h3>Product 3</h3>' parent='<article class=\"product\" data-id=\"3\"><h3...'>]\n```\nFind all `h3` and `h2` elements whose text content matches the regex `Product` only\n```python\npage.find_all(['h3', 'h2'], re.compile(r'Product'))\n# [<data='<h3>Product 1</h3>' parent='<article class=\"product\" data-id=\"1\"><h3...'>, <data='<h3>Product 2</h3>' parent='<article class=\"product\" data-id=\"2\"><h3...'>, <data='<h3>Product 3</h3>' parent='<article class=\"product\" data-id=\"3\"><h3...'>, <data='<h2>Products</h2>' parent='<section id=\"products\" schema='{\"jsonabl...'>]\n```\nFind all elements whose text content matches exactly `Products` (Whitespaces are not taken into consideration)\n```python\npage.find_by_text('Products', first_match=False)\n# [<data='<h2>Products</h2>' parent='<section id=\"products\" schema='{\"jsonabl...'>]\n```\nOr find all elements whose text content matches regex `Product \\d`\n```python\npage.find_by_regex(r'Product \\d', first_match=False)\n# [<data='<h3>Product 1</h3>' parent='<article class=\"product\" data-id=\"1\"><h3...'>, <data='<h3>Product 2</h3>' parent='<article class=\"product\" data-id=\"2\"><h3...'>, <data='<h3>Product 3</h3>' parent='<article class=\"product\" data-id=\"3\"><h3...'>]\n```\nFind all elements that are similar to the element you want\n```python\ntarget_element = page.find_by_regex(r'Product \\d', first_match=True)\n# <data='<h3>Product 1</h3>' parent='<article class=\"product\" data-id=\"1\"><h3...'>\ntarget_element.find_similar()\n# [<data='<h3>Product 2</h3>' parent='<article class=\"product\" data-id=\"2\"><h3...'>, <data='<h3>Product 3</h3>' parent='<article class=\"product\" 
data-id=\"3\"><h3...'>]\n```\nFind the first element that matches a CSS selector\n```python\npage.css('.product-list [data-id=\"1\"]')[0]\n# <data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>\n```\nFind all elements that match a CSS selector\n```python\npage.css('.product-list article')\n# [<data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>, <data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>, <data='<article class=\"product\" data-id=\"3\"><h3...' parent='<div class=\"product-list\"> <article clas...'>]\n```\nFind the first element that matches an XPath selector\n```python\npage.xpath(\"//*[@id='products']/div/article\")[0]\n# <data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>\n```\nFind all elements that match an XPath selector\n```python\npage.xpath(\"//*[@id='products']/div/article\")\n# [<data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>, <data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>, <data='<article class=\"product\" data-id=\"3\"><h3...' parent='<div class=\"product-list\"> <article clas...'>]\n```\n\nWith this, we just scratched the surface of these functions; more advanced options with these selection methods are shown later.\n## Accessing elements' data\nIt's as simple as\n```python\n>>> section_element.tag\n'section'\n>>> print(section_element.attrib)\n{'id': 'products', 'schema': '{\"jsonable\": \"data\"}'}\n>>> section_element.attrib['schema'].json()  # If an attribute value can be converted to json, then use `.json()` to convert it\n{'jsonable': 'data'}\n>>> section_element.text  # Direct text content\n''\n>>> section_element.get_all_text()  # All text content recursively\n'Products\\nProduct 1\\nThis is product 1\\n$10.99\\nIn stock: 5\\nProduct 2\\nThis is product 2\\n$20.99\\nIn stock: 3\\nProduct 3\\nThis is product 3\\n$15.99\\nOut of stock'\n>>> section_element.html_content  # The HTML content of the element\n'<section id=\"products\" schema=\\'{\"jsonable\": \"data\"}\\'><h2>Products</h2>\\n        <div class=\"product-list\">\\n          <article class=\"product\" data-id=\"1\"><h3>Product 1</h3>\\n            <p class=\"description\">This is product 1</p>\\n            <span class=\"price\">$10.99</span>\\n            <div class=\"hidden stock\">In stock: 5</div>\\n          </article><article class=\"product\" data-id=\"2\"><h3>Product 2</h3>\\n            <p class=\"description\">This is product 2</p>\\n            <span class=\"price\">$20.99</span>\\n            <div class=\"hidden stock\">In stock: 3</div>\\n          </article><article class=\"product\" data-id=\"3\"><h3>Product 3</h3>\\n            <p class=\"description\">This is product 3</p>\\n            <span class=\"price\">$15.99</span>\\n            <div class=\"hidden stock\">Out of stock</div>\\n          </article></div>\\n      </section>'\n>>> print(section_element.prettify())  # The prettified version\n'''\n<section id=\"products\" schema='{\"jsonable\": \"data\"}'><h2>Products</h2>\n    <div class=\"product-list\">\n      <article class=\"product\" data-id=\"1\"><h3>Product 1</h3>\n        <p class=\"description\">This is product 1</p>\n        <span class=\"price\">$10.99</span>\n        <div class=\"hidden stock\">In stock: 
5</div>\n      </article><article class=\"product\" data-id=\"2\"><h3>Product 2</h3>\n        <p class=\"description\">This is product 2</p>\n        <span class=\"price\">$20.99</span>\n        <div class=\"hidden stock\">In stock: 3</div>\n      </article><article class=\"product\" data-id=\"3\"><h3>Product 3</h3>\n        <p class=\"description\">This is product 3</p>\n        <span class=\"price\">$15.99</span>\n        <div class=\"hidden stock\">Out of stock</div>\n      </article>\n    </div>\n</section>\n'''\n>>> section_element.path  # All the ancestors in the DOM tree of this element\n[<data='<main><section id=\"products\" schema='{\"j...' parent='<body> <header><nav><ul><li> <a href=\"#h...'>,\n <data='<body> <header><nav><ul><li> <a href=\"#h...' parent='<html><head><title>Complex Web Page</tit...'>,\n <data='<html><head><title>Complex Web Page</tit...'>]\n>>> section_element.generate_css_selector\n'#products'\n>>> section_element.generate_full_css_selector\n'body > main > #products > #products'\n>>> section_element.generate_xpath_selector\n\"//*[@id='products']\"\n>>> section_element.generate_full_xpath_selector\n\"//body/main/*[@id='products']\"\n```\n\n## Navigation\nUsing the elements we found above \n\n```python\n>>> section_element.parent\n<data='<main><section id=\"products\" schema='{\"j...' parent='<body> <header><nav><ul><li> <a href=\"#h...'>\n>>> section_element.parent.tag\n'main'\n>>> section_element.parent.parent.tag\n'body'\n>>> section_element.children\n[<data='<h2>Products</h2>' parent='<section id=\"products\" schema='{\"jsonabl...'>,\n <data='<div class=\"product-list\"> <article clas...' parent='<section id=\"products\" schema='{\"jsonabl...'>]\n>>> section_element.siblings\n[<data='<section id=\"reviews\"><h2>Customer Revie...' parent='<main><section id=\"products\" schema='{\"j...'>]\n>>> section_element.next  # gets the next element, the same logic applies to `quote.previous`.\n<data='<section id=\"reviews\"><h2>Customer Revie...' parent='<main><section id=\"products\" schema='{\"j...'>\n>>> section_element.children.css('h2::text').getall()\n['Products']\n>>> page.css('[data-id=\"1\"]')[0].has_class('product')\nTrue\n```\nIf your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element, like the one below\n```python\nfor ancestor in section_element.iterancestors():\n    # do something with it...\n```\nYou can search for a specific ancestor of an element that satisfies a function; all you need to do is pass a function that takes a `Selector` object as an argument and returns `True` if the condition is satisfied or `False` otherwise, like below:\n```python\n>>> section_element.find_ancestor(lambda ancestor: ancestor.css('nav'))\n<data='<body> <header><nav><ul><li> <a href=\"#h...' 
parent='<html><head><title>Complex Web Page</tit...'>\n```\n\n## Fetching websites\nInstead of passing the raw HTML to Scrapling, you can retrieve a website's response directly via HTTP requests or by fetching it in a browser.\n\nThere's a fetcher for every use case.\n\n### HTTP Requests\nFor simple HTTP requests, there's a `Fetcher` class that can be imported and used as below:\n```python\nfrom scrapling.fetchers import Fetcher\npage = Fetcher.get('https://scrapling.requestcatcher.com/get', impersonate=\"chrome\")\n```\nWith that out of the way, here's how to do all HTTP methods:\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> page = Fetcher.get('https://scrapling.requestcatcher.com/get', stealthy_headers=True, follow_redirects=True)\n>>> page = Fetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')\n>>> page = Fetcher.put('https://scrapling.requestcatcher.com/put', data={'key': 'value'})\n>>> page = Fetcher.delete('https://scrapling.requestcatcher.com/delete')\n```\nFor async requests, you only replace the import, as below:\n```python\n>>> from scrapling.fetchers import AsyncFetcher\n>>> page = await AsyncFetcher.get('https://scrapling.requestcatcher.com/get', stealthy_headers=True, follow_redirects=True)\n>>> page = await AsyncFetcher.post('https://scrapling.requestcatcher.com/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')\n>>> page = await AsyncFetcher.put('https://scrapling.requestcatcher.com/put', data={'key': 'value'})\n>>> page = await AsyncFetcher.delete('https://scrapling.requestcatcher.com/delete')\n```\n\n!!! note \"Notes:\"\n\n    1. You have the `stealthy_headers` argument, which, when enabled, generates real browser headers for the request and uses them, including a Google referer header. It's enabled by default.\n    2. The `impersonate` argument lets you fake the TLS fingerprint of a specific browser version.\n    3. There's also the `http3` argument, which, when enabled, makes the fetcher use HTTP/3 for requests, making them look more authentic.\n\nThis is just the tip of the iceberg with this fetcher; check out the rest from [here](fetching/static.md).\n\n### Dynamic loading\nWe have you covered if you deal with dynamic websites, as most websites are today!\n\nThe `DynamicFetcher` class (formerly `PlayWrightFetcher`) offers many options for fetching and loading web pages using Chromium-based browsers.\n```python\n>>> from scrapling.fetchers import DynamicFetcher\n>>> page = DynamicFetcher.fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True)  # Vanilla Playwright option\n>>> page.css(\"#search a::attr(href)\").get()\n'https://github.com/D4Vinci/Scrapling'\n>>> # The async version of fetch\n>>> page = await DynamicFetcher.async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True)\n>>> page.css(\"#search a::attr(href)\").get()\n'https://github.com/D4Vinci/Scrapling'\n```\nIt's built on top of [Playwright](https://playwright.dev/python/), and it currently provides two main run options that can be mixed as you like:\n\n- Vanilla Playwright without any modifications other than the ones you chose. It uses the Chromium browser.\n- Real browsers, like your installed Chrome browser, by passing the `real_chrome` argument or the CDP URL of a browser for the fetcher to control; most of the options can still be enabled with it.\n\nAgain, this is just the tip of the iceberg with this fetcher. 
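\n\nBefore moving on, here's a quick illustrative sketch of mixing those run options (the URL and selector are placeholders, and `real_chrome` assumes Chrome is installed locally):\n```python\nfrom scrapling.fetchers import DynamicFetcher\n\n# Illustrative sketch only: drive your installed Chrome instead of the bundled Chromium\npage = DynamicFetcher.fetch(\n    'https://example.com',   # placeholder URL\n    real_chrome=True,        # the second run option above\n    disable_resources=True,  # skip heavy resources to speed up loading\n)\nlinks = page.css('a::attr(href)').getall()  # then query the result like any Selector\n```\n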
Check out the rest from [here](fetching/dynamic.md) for all details and the complete list of arguments.\n\n### Dynamic anti-protection loading\nWe also have you covered if you deal with dynamic websites with annoying anti-protections!\n\nThe `StealthyFetcher` class uses a stealthy version of the `DynamicFetcher` explained above. \n\nSome of the things it does:\n\n1. It easily bypasses all types of Cloudflare's Turnstile/Interstitial automatically. \n2. It bypasses CDP runtime leaks and WebRTC leaks.\n3. It isolates JS execution, removes many Playwright fingerprints, and stops detection through some of the known behaviors that bots do.\n4. It generates canvas noise to prevent fingerprinting through canvas.\n5. It automatically patches known methods to detect running in headless mode and provides an option to defeat timezone mismatch attacks.\n6. and other anti-protection options...\n\n```python\n>>> from scrapling.fetchers import StealthyFetcher\n>>> page = StealthyFetcher.fetch('https://www.browserscan.net/bot-detection')  # Running headless by default\n>>> page.status == 200\nTrue\n>>> page = StealthyFetcher.fetch('https://nopecha.com/demo/cloudflare', solve_cloudflare=True)  # Solve Cloudflare captcha automatically if presented\n>>> page.status == 200\nTrue\n>>> page = StealthyFetcher.fetch('https://www.browserscan.net/bot-detection', humanize=True, os_randomize=True) # and the rest of arguments...\n>>> # The async version of fetch\n>>> page = await StealthyFetcher.async_fetch('https://www.browserscan.net/bot-detection')\n>>> page.status == 200\nTrue\n```\n\nAgain, this is just the tip of the iceberg with this fetcher. Check out the rest from [here](fetching/stealthy.md) for all details and the complete list of arguments.\n\n---\n\nThat's Scrapling at a glance. If you want to learn more, continue to the next section."
  },
  {
    "path": "docs/parsing/adaptive.md",
    "content": "# Adaptive scraping\n\n!!! success \"Prerequisites\"\n\n    1. You've completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object.\n    2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) class.\n\nAdaptive scraping (previously known as automatch) is one of Scrapling's most powerful features. It allows your scraper to survive website changes by intelligently tracking and relocating elements.\n\nLet's say you are scraping a page with a structure like this:\n```html\n<div class=\"container\">\n    <section class=\"products\">\n        <article class=\"product\" id=\"p1\">\n            <h3>Product 1</h3>\n            <p class=\"description\">Description 1</p>\n        </article>\n        <article class=\"product\" id=\"p2\">\n            <h3>Product 2</h3>\n            <p class=\"description\">Description 2</p>\n        </article>\n    </section>\n</div>\n```\nAnd you want to scrape the first product, the one with the `p1` ID. You will probably write a selector like this\n```python\npage.css('#p1')\n```\nWhen website owners implement structural changes like\n```html\n<div class=\"new-container\">\n    <div class=\"product-wrapper\">\n        <section class=\"products\">\n            <article class=\"product new-class\" data-id=\"p1\">\n                <div class=\"product-info\">\n                    <h3>Product 1</h3>\n                    <p class=\"new-description\">Description 1</p>\n                </div>\n            </article>\n            <article class=\"product new-class\" data-id=\"p2\">\n                <div class=\"product-info\">\n                    <h3>Product 2</h3>\n                    <p class=\"new-description\">Description 2</p>\n                </div>\n            </article>\n        </section>\n    </div>\n</div>\n```\nThe selector will no longer function, and your code needs maintenance. That's where Scrapling's `adaptive` feature comes into play.\n\nWith Scrapling, you can enable the `adaptive` feature the first time you select an element, and the next time you select that element and it doesn't exist, Scrapling will remember its properties and search on the website for the element with the highest percentage of similarity to that element, and without AI :)\n\n```python\nfrom scrapling import Selector, Fetcher\n# Before the change\npage = Selector(page_source, adaptive=True, url='example.com')\n# or\nFetcher.adaptive = True\npage = Fetcher.get('https://example.com')\n# then\nelement = page.css('#p1', auto_save=True)\nif not element:  # One day website changes?\n    element = page.css('#p1', adaptive=True)  # Scrapling still finds it!\n# the rest of your code...\n```\nBelow, I will show you an example of how to use this feature. Then, we will dive deep into how to use it and provide details about this feature. Note that it works with all selection methods, not just CSS/XPATH selection.\n\n## Real-World Scenario\nLet's use a real website as an example and use one of the fetchers to fetch its source. To achieve this, we need to identify a website that is about to update its design/structure, copy its source, and then wait for the website to change. 
Of course, that's nearly impossible to know unless I know the website's owner, but that will make it a staged test, haha.\n\nTo solve this issue, I will use [The Web Archive](https://archive.org/)'s [Wayback Machine](https://web.archive.org/). Here is a copy of [StackOverFlow's website in 2010](https://web.archive.org/web/20100102003420/http://stackoverflow.com/); pretty old, eh?</br>Let's see if the adaptive feature can extract the same button in the old design from 2010 and the current design using the same selector :)\n\nIf I want to extract the Questions button from the old design, I can use a selector like this: `#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a`. This selector is too specific because it was generated by Google Chrome.\n\n\nNow, let's test the same selector in both versions\n```python\n>> from scrapling import Fetcher\n>> selector = '#hmenus > div:nth-child(1) > ul > li:nth-child(1) > a'\n>> old_url = \"https://web.archive.org/web/20100102003420/http://stackoverflow.com/\"\n>> new_url = \"https://stackoverflow.com/\"\n>> Fetcher.configure(adaptive = True, adaptive_domain='stackoverflow.com')\n>> \n>> page = Fetcher.get(old_url, timeout=30)\n>> element1 = page.css(selector, auto_save=True)[0]\n>> \n>> # Same selector but used in the updated website\n>> page = Fetcher.get(new_url)\n>> element2 = page.css(selector, adaptive=True)[0]\n>> \n>> if element1.text == element2.text:\n...    print('Scrapling found the same element in the old and new designs!')\n'Scrapling found the same element in the old and new designs!'\n```\nNote that I introduced a new argument called `adaptive_domain`. This is because, for Scrapling, these are two different domains (`archive.org` and `stackoverflow.com`), so Scrapling will isolate their `adaptive` data. To inform Scrapling that they are the same website, we must pass the custom domain we wish to use while saving `adaptive` data for both, ensuring Scrapling doesn't isolate them.\n\nThe code will be the same in a real-world scenario, except it will use the same URL for both requests, so you won't need to use the `adaptive_domain` argument. This is the closest example I can give to real-world cases, so I hope it didn't confuse you :)\n\nHence, in the two examples above, I used both the `Selector` and `Fetcher` classes to show that the adaptive logic is the same.\n\n!!! info\n\n    The main reason for creating the `adaptive_domain` argument was to handle if the website changed its URL while changing the design/structure. In that case, you can use it to continue using the previously stored adaptive data for the new URL. Otherwise, scrapling will consider it a new website and discard the old data.\n\n## How the adaptive scraping feature works\nAdaptive scraping works in two phases:\n\n1. **Save Phase**: Store unique properties of elements\n2. **Match Phase**: Find elements with similar properties later\n\nLet's say you've selected an element through any method and want the library to find it the next time you scrape this website, even if it undergoes structural/design changes. \n\nWith as few technical details as possible, the general logic goes as follows:\n\n  1. You tell Scrapling to save that element's unique properties in one of the ways we will show below.\n  2. Scrapling uses its configured database (SQLite by default) and saves each element's unique properties.\n  3. 
Now, because everything about the element can be changed or removed by the website's owner(s), nothing from the element can be used as a unique identifier for the database. To solve this issue, I made the storage system rely on two things:\n     1. The domain of the current website. If you are using the `Selector` class, pass it when initializing; if you are using a fetcher, the domain will be automatically taken from the URL.\n     2. An `identifier` to query that element's properties from the database. You don't always have to set the identifier yourself; we'll discuss this later.\n\n     Together, they will later be used to retrieve the element's unique properties from the database.\n\n  4. Later, when the website's structure changes, you tell Scrapling to find the element by enabling `adaptive`. Scrapling retrieves the element's unique properties and matches all elements on the page against the unique properties we already have for this element. A score is calculated based on their similarity to the desired element. In that comparison, everything is taken into consideration, as you will see later \n  5. The element(s) with the highest similarity score to the wanted element are returned.\n\n### The unique properties\nYou might wonder what unique properties we are referring to when discussing the removal or alteration of all element properties.\n\nFor Scrapling, the unique elements we are relying on are:\n\n- Element tag name, text, attributes (names and values), siblings (tag names only), and path (tag names only).\n- Element's parent tag name, attributes (names and values), and text.\n\nBut you need to understand that the comparison between elements isn't exact; it's more about how similar these values are. So everything is considered, even the values' order, like the order in which the element class names were written before and the order in which the same element class names are written now.\n\n## How to use adaptive feature\nThe adaptive feature can be applied to any found element, and it's added as arguments to CSS/XPath Selection methods, as you saw above, but we will get back to that later.\n\nFirst, you must enable the `adaptive` feature by passing `adaptive=True` to the [Selector](main_classes.md#selector) class when you initialize it or enable it in the fetcher you are using of the available fetchers, as we will show.\n\nExamples:\n```python\n>>> from scrapling import Selector, Fetcher\n>>> page = Selector(html_doc, adaptive=True)\n# OR\n>>> Fetcher.adaptive = True\n>>> page = Fetcher.get('https://example.com')\n```\nIf you are using the [Selector](main_classes.md#selector) class, you need to pass the url of the website you are using with the argument `url` so Scrapling can separate the properties saved for each element by domain.\n\nIf you didn't pass a URL, the word `default` will be used in place of the URL field while saving the element's unique properties. So, this will only be an issue if you use the same identifier later for a different website and don't pass the URL parameter when initializing it. The save process overwrites previous data, and the `adaptive` feature uses only the latest saved properties.\n\nBesides those arguments, we have `storage` and `storage_args`. Both are for the class to connect to the database; by default, it uses the SQLite class provided by the library. 
Those arguments shouldn't matter unless you want to write your own storage system, which we will cover on a [separate page in the development section](../development/adaptive_storage_system.md).\n\nNow that you've enabled the `adaptive` feature globally, you have two main ways to use it.\n\n### The CSS/XPath Selection way\nAs you have seen in the example above, first, you have to use the `auto_save` argument while selecting an element that exists on the page, like below\n```python\nelement = page.css('#p1', auto_save=True)\n```\nAnd when the element doesn't exist, you can use the same selector and the `adaptive` argument, and the library will find it for you\n```python\nelement = page.css('#p1', adaptive=True)\n```\nPretty simple, eh?\n\nWell, a lot happened under the hood here. Remember the identifier we mentioned before that you need to set to retrieve the element you want? Here, with the `css`/`xpath` methods, the identifier is set automatically as the selector you passed here to make things easier :)\n\nAdditionally, for all these methods, you can pass the `identifier` argument to set it yourself. This is useful in some instances, or you can use it to save properties with the `auto_save` argument.\n\n### The manual way\nYou manually save and retrieve an element, then relocate it, which all happens within the `adaptive` feature, as shown below. This allows you to relocate any element using any method or selection!\n\nFirst, let's say you got an element like this by text:\n```python\n>>> element = page.find_by_text('Tipping the Velvet', first_match=True)\n```\nYou can save its unique properties using the `save` method, as shown below, but you must set the identifier yourself. For this example, I chose `my_special_element` as an identifier, but it's best to use a meaningful identifier in your code for the same reason you use meaningful variable names :)\n```python\n>>> page.save(element, 'my_special_element')\n```\nNow, later, when you want to retrieve it and relocate it inside the page with `adaptive`, it would be like this\n```python\n>>> element_dict = page.retrieve('my_special_element')\n>>> page.relocate(element_dict, selector_type=True)\n[<data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<h3><a href=\"catalogue/tipping-the-velve...'>]\n>>> page.relocate(element_dict, selector_type=True).css('::text').getall()\n['Tipping the Velvet']\n```\nHence, the `retrieve` and `relocate` methods are used.\n\nIf you want to keep it as a `lxml.etree` object, leave the `selector_type` argument\n```python\n>>> page.relocate(element_dict)\n[<Element a at 0x105a2a7b0>]\n```\n\n## Troubleshooting\n\n### No Matches Found\n```python\n# 1. Check if data was saved\nelement_data = page.retrieve('identifier')\nif not element_data:\n    print(\"No data saved for this identifier\")\n\n# 2. Try with different identifier\nproducts = page.css('.product', adaptive=True, identifier='old_selector')\n\n# 3. Save again with new identifier\nproducts = page.css('.new-product', auto_save=True, identifier='new_identifier')\n```\n\n### Wrong Elements Matched\n```python\n# Use more specific selectors\nproducts = page.css('.product-list .product', auto_save=True)\n\n# Or save with more context\nproduct = page.find_by_text('Product Name').parent\npage.save(product, 'specific_product')\n```\n\n## Known Issues\nIn the `adaptive` save process, only the unique properties of the first element in the selection results are saved. 
So if the selector you are using selects different elements on the page in other locations, `adaptive` will return the first element to you only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector, for example), as these selectors are separated and each is executed alone.\n\n## Final thoughts\nExplaining this feature in detail without complications turned out to be challenging. However, still, if there's something left unclear, you can head out to the [discussions section](https://github.com/D4Vinci/Scrapling/discussions), and I will reply to you ASAP, or the Discord server, or reach out to me privately and have a chat :)"
  },
  {
    "path": "docs/parsing/main_classes.md",
    "content": "# Parsing main classes\n\n!!! success \"Prerequisites\"\n\n    - You’ve completed or read the [Querying elements](../parsing/selection.md) page to understand how to find/extract elements from the [Selector](../parsing/main_classes.md#selector) object.\n\nAfter exploring the various ways to select elements with Scrapling and its related features, let's take a step back and examine the [Selector](#selector) class in general, as well as other objects, to gain a better understanding of the parsing engine.\n\nThe [Selector](#selector) class is the core parsing engine in Scrapling, providing HTML parsing and element selection capabilities. You can always import it with any of the following imports\n```python\nfrom scrapling import Selector\nfrom scrapling.parser import Selector\n```\nThen use it directly as you already learned in the [overview](../overview.md) page\n```python\npage = Selector(\n    '<html>...</html>',\n    url='https://example.com'\n)\n\n# Then select elements as you like\nelements = page.css('.product')\n```\nIn Scrapling, the main object you deal with after passing an HTML source or fetching a website is, of course, a [Selector](#selector) object. Any operation you do, like selection, navigation, etc., will return either a [Selector](#selector) object or a [Selectors](#selectors) object, given that the result is element/elements from the page, not text or similar.\n\nIn other words, the main page is a [Selector](#selector) object, and the elements within are [Selector](#selector) objects, and so on. Any text, such as the text content inside elements or the text inside element attributes, is a [TextHandler](#texthandler) object, and the attributes of each element are stored as [AttributesHandler](#attributeshandler). We will return to both objects later, so let's focus on the [Selector](#selector) object.\n\n## Selector\n### Arguments explained\nThe most important one is `content`, it's used to pass the HTML code you want to parse, and it accepts the HTML content as `str` or `bytes`.\n\nOtherwise, you have the arguments `url`, `adaptive`, `storage`, and `storage_args`. All these arguments are settings used with the `adaptive` feature, and they don't make a difference if you are not going to use that feature, so just ignore them for now, and we will explain them in the [adaptive](adaptive.md) feature page.\n\nThen you have the arguments for parsing adjustments or adjusting/manipulating the HTML content while the library is parsing it:\n\n- **encoding**: This is the encoding that will be used while parsing the HTML. The default is `UTF-8`.\n- **keep_comments**: This tells the library whether to keep HTML comments while parsing the page. It's disabled by default because it can cause issues with your scraping in various ways.\n- **keep_cdata**: Same logic as the HTML comments. [cdata](https://stackoverflow.com/questions/7092236/what-is-cdata-in-html) is removed by default for cleaner HTML.\n\nI have intended to ignore the arguments `huge_tree` and `root` to avoid making this page more complicated than needed.\nYou may notice that I'm doing that a lot because it involves advanced features that you don't need to know to use the library. The development section will cover these missing parts if you are very invested.\n\nAfter that, most properties on the main page and its elements are lazily loaded. 
This means they don't get initialized until you use them like the text content of a page/element, and this is one of the reasons for Scrapling speed :)\n\n### Properties\nYou have already seen much of this on the [overview](../overview.md) page, but don't worry if you didn't. We will review it more thoroughly using more advanced methods/usages. For clarity, the properties for traversal are separated below in the [traversal](#traversal) section.\n\nLet's say we are parsing this HTML page for simplicity:\n```html\n<html>\n  <head>\n    <title>Some page</title>\n  </head>\n  <body>\n    <div class=\"product-list\">\n      <article class=\"product\" data-id=\"1\">\n        <h3>Product 1</h3>\n        <p class=\"description\">This is product 1</p>\n        <span class=\"price\">$10.99</span>\n        <div class=\"hidden stock\">In stock: 5</div>\n      </article>\n    \n      <article class=\"product\" data-id=\"2\">\n        <h3>Product 2</h3>\n        <p class=\"description\">This is product 2</p>\n        <span class=\"price\">$20.99</span>\n        <div class=\"hidden stock\">In stock: 3</div>\n      </article>\n    \n      <article class=\"product\" data-id=\"3\">\n        <h3>Product 3</h3>\n        <p class=\"description\">This is product 3</p>\n        <span class=\"price\">$15.99</span>\n        <div class=\"hidden stock\">Out of stock</div>\n      </article>\n    </div>\n\n    <script id=\"page-data\" type=\"application/json\">\n      {\n        \"lastUpdated\": \"2024-09-22T10:30:00Z\",\n        \"totalProducts\": 3\n      }\n    </script>\n  </body>\n</html>\n```\nLoad the page directly as shown before:\n```python\nfrom scrapling import Selector\npage = Selector(html_doc)\n```\nGet all text content on the page recursively\n```python\n>>> page.get_all_text()\n'Some page\\n\\n    \\n\\n      \\nProduct 1\\nThis is product 1\\n$10.99\\nIn stock: 5\\nProduct 2\\nThis is product 2\\n$20.99\\nIn stock: 3\\nProduct 3\\nThis is product 3\\n$15.99\\nOut of stock'\n```\nGet the first article, as explained before; we will use it as an example\n```python\narticle = page.find('article')\n```\nWith the same logic, get all text content on the element recursively\n```python\n>>> article.get_all_text()\n'Product 1\\nThis is product 1\\n$10.99\\nIn stock: 5'\n```\nBut if you try to get the direct text content, it will be empty because it doesn't have direct text in the HTML code above\n```python\n>>> article.text\n''\n```\nThe `get_all_text` method has the following optional arguments:\n\n1. **separator**: All strings collected will be concatenated using this separator. The default is '\\n'.\n2. **strip**: If enabled, strings will be stripped before concatenation. Disabled by default.\n3. **ignore_tags**: A tuple of all tag names you want to ignore in the final results and ignore any elements nested within them. The default is `('script', 'style',)`.\n4. **valid_values**: If enabled, the method will only collect elements with real values, so all elements with empty text content or only whitespaces will be ignored. 
It's enabled by default\n\nBy the way, the text returned here is not a standard string but a [TextHandler](#texthandler); we will get to this in detail later, so if the text content can be serialized to JSON, use `.json()` on it\n```python\n>>> script = page.find('script')\n>>> script.json()\n{'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}\n```\nLet's continue to get the element tag\n```python\n>>> article.tag\n'article'\n```\nIf you use it on the page directly, you will find that you are operating on the root `html` element\n```python\n>>> page.tag\n'html'\n```\nNow, I think I've hammered the (`page`/`element`) idea, so I won't return to it.\n\nGetting the attributes of the element\n```python\n>>> print(article.attrib)\n{'class': 'product', 'data-id': '1'}\n```\nAccess a specific attribute with any of the following\n```python\n>>> article.attrib['class']\n>>> article.attrib.get('class')\n>>> article['class']  # new in v0.3\n```\nCheck if the attributes contain a specific attribute with any of the methods below\n```python\n>>> 'class' in article.attrib\n>>> 'class' in article  # new in v0.3\n```\nGet the HTML content of the element\n```python\n>>> article.html_content\n'<article class=\"product\" data-id=\"1\"><h3>Product 1</h3>\\n        <p class=\"description\">This is product 1</p>\\n        <span class=\"price\">$10.99</span>\\n        <div class=\"hidden stock\">In stock: 5</div>\\n      </article>'\n```\nGet the prettified version of the element's HTML content\n```python\nprint(article.prettify())\n```\n```html\n<article class=\"product\" data-id=\"1\"><h3>Product 1</h3>\n    <p class=\"description\">This is product 1</p>\n    <span class=\"price\">$10.99</span>\n    <div class=\"hidden stock\">In stock: 5</div>\n</article>\n```\nUse the `.body` property to get the raw content of the page. Starting from v0.4, when used on a `Response` object from fetchers, `.body` always returns `bytes`.\n```python\n>>> page.body\n'<html>\\n  <head>\\n    <title>Some page</title>\\n  </head>\\n  ...'\n```\nTo get all the ancestors in the DOM tree of this element\n```python\n>>> article.path\n[<data='<div class=\"product-list\"> <article clas...' parent='<body> <div class=\"product-list\"> <artic...'>,\n <data='<body> <div class=\"product-list\"> <artic...' parent='<html><head><title>Some page</title></he...'>,\n <data='<html><head><title>Some page</title></he...'>]\n```\nGenerate a CSS shortened selector if possible, or generate the full selector\n```python\n>>> article.generate_css_selector\n'body > div > article'\n>>> article.generate_full_css_selector\n'body > div > article'\n```\nSame case with XPath\n```python\n>>> article.generate_xpath_selector\n\"//body/div/article\"\n>>> article.generate_full_xpath_selector\n\"//body/div/article\"\n```\n\n### Traversal\nUsing the elements we found above, we will go over the properties/methods for moving on the page in detail.\n\nIf you are unfamiliar with the DOM tree or the tree data structure in general, the following traversal part can be confusing. I recommend you look up these concepts online to better understand them.\n\nIf you are too lazy to search about it, here's a quick explanation to give you a good idea.<br/>\nIn simple words, the `html` element is the root of the website's tree, as every page starts with an `html` element.<br/>\nThis element will be positioned directly above elements such as `head` and `body`. These are considered \"children\" of the `html` element, and the `html` element is considered their \"parent\". 
The element `body` is a \"sibling\" of the element `head` and vice versa.\n\nAccessing the parent of an element\n```python\n>>> article.parent\n<data='<div class=\"product-list\"> <article clas...' parent='<body> <div class=\"product-list\"> <artic...'>\n>>> article.parent.tag\n'div'\n```\nYou can chain it as you want, which applies to all similar properties/methods we will review.\n```python\n>>> article.parent.parent.tag\n'body'\n```\nGet the children of an element\n```python\n>>> article.children\n[<data='<h3>Product 1</h3>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<p class=\"description\">This is product 1...' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<span class=\"price\">$10.99</span>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<div class=\"hidden stock\">In stock: 5</d...' parent='<article class=\"product\" data-id=\"1\"><h3...'>]\n```\nGet all elements underneath an element. It acts as a nested version of the `children` property\n```python\n>>> article.below_elements\n[<data='<h3>Product 1</h3>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<p class=\"description\">This is product 1...' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<span class=\"price\">$10.99</span>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<div class=\"hidden stock\">In stock: 5</d...' parent='<article class=\"product\" data-id=\"1\"><h3...'>]\n```\nThis element returns the same result as the `children` property because its children don't have children.\n\nAnother example of using the element with the `product-list` class will clear the difference between the `children` property and the `below_elements` property\n```python\n>>> products_list = page.css('.product-list')[0]\n>>> products_list.children\n[<data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n <data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n <data='<article class=\"product\" data-id=\"3\"><h3...' parent='<div class=\"product-list\"> <article clas...'>]\n\n>>> products_list.below_elements\n[<data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n <data='<h3>Product 1</h3>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<p class=\"description\">This is product 1...' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<span class=\"price\">$10.99</span>' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<div class=\"hidden stock\">In stock: 5</d...' parent='<article class=\"product\" data-id=\"1\"><h3...'>,\n <data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n...]\n```\nGet the siblings of an element\n```python\n>>> article.siblings\n[<data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n <data='<article class=\"product\" data-id=\"3\"><h3...' parent='<div class=\"product-list\"> <article clas...'>]\n```\nGet the next element of the current element\n```python\n>>> article.next\n<data='<article class=\"product\" data-id=\"2\"><h3...' 
parent='<div class=\"product-list\"> <article clas...'>\n```\nThe same logic applies to the `previous` property\n```python\n>>> article.previous  # It's the first child, so it doesn't have a previous element\n>>> second_article = page.css('.product[data-id=\"2\"]')[0]\n>>> second_article.previous\n<data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>\n```\nYou can check easily and pretty fast if an element has a specific class name or not\n```python\n>>> article.has_class('product')\nTrue\n```\nIf your case needs more than the element's parent, you can iterate over the whole ancestors' tree of any element, like the example below\n```python\nfor ancestor in article.iterancestors():\n    # do something with it...\n```\nYou can search for a specific ancestor of an element that satisfies a search function; all you need to do is pass a function that takes a [Selector](#selector) object as an argument and return `True` if the condition satisfies or `False` otherwise, like below:\n```python\n>>> article.find_ancestor(lambda ancestor: ancestor.has_class('product-list'))\n<data='<div class=\"product-list\"> <article clas...' parent='<body> <div class=\"product-list\"> <artic...'>\n\n>>> article.find_ancestor(lambda ancestor: ancestor.css('.product-list'))  # Same result, different approach\n<data='<div class=\"product-list\"> <article clas...' parent='<body> <div class=\"product-list\"> <artic...'>\n```\n## Selectors\nThe class `Selectors` is the \"List\" version of the [Selector](#selector) class. It inherits from the Python standard `List` type, so it shares all `List` properties and methods while adding more methods to make the operations you want to execute on the [Selector](#selector) instances within more straightforward.\n\nIn the [Selector](#selector) class, all methods/properties that should return a group of elements return them as a [Selectors](#selectors) class instance.\n\nStarting with v0.4, all selection methods consistently return [Selector](#selector)/[Selectors](#selectors) objects, even for text nodes and attribute values. Text nodes (selected via `::text`, `/text()`, `::attr()`, `/@attr`) are wrapped in [Selector](#selector) objects. These text node selectors have `tag` set to `\"#text\"`, and their `text` property returns the text value. You can still access the text value directly, and all other properties return empty/default values gracefully.\n\n```python\n>>> page.css('a::text')              # -> Selectors (of text node Selectors)\n>>> page.xpath('//a/text()')         # -> Selectors\n>>> page.css('a::text').get()        # -> TextHandler (the first text value)\n>>> page.css('a::text').getall()     # -> TextHandlers (all text values)\n>>> page.css('a::attr(href)')        # -> Selectors\n>>> page.xpath('//a/@href')          # -> Selectors\n>>> page.css('.price_color')         # -> Selectors\n```\n\n### Data extraction methods\nStarting with v0.4, [Selector](#selector) and [Selectors](#selectors) both provide `get()`, `getall()`, and their aliases `extract_first` and `extract` (following Scrapy conventions). 
The old `get_all()` method has been removed.\n\n**On a [Selector](#selector) object:**\n\n- `get()` returns a `TextHandler` — for text node selectors, it returns the text value; for HTML element selectors, it returns the serialized outer HTML.\n- `getall()` returns a `TextHandlers` list containing the single serialized string.\n- `extract_first` is an alias for `get()`, and `extract` is an alias for `getall()`.\n\n```python\n>>> page.css('h3')[0].get()        # Outer HTML of the element\n'<h3>Product 1</h3>'\n\n>>> page.css('h3::text')[0].get()  # Text value of the text node\n'Product 1'\n```\n\n**On a [Selectors](#selectors) object:**\n\n- `get(default=None)` returns the serialized string of the **first** element, or `default` if the list is empty.\n- `getall()` serializes **all** elements and returns a `TextHandlers` list.\n- `extract_first` is an alias for `get()`, and `extract` is an alias for `getall()`.\n\n```python\n>>> page.css('.price::text').get()      # First price text\n'$10.99'\n\n>>> page.css('.price::text').getall()   # All price texts\n['$10.99', '$20.99', '$15.99']\n\n>>> page.css('.price::text').get('')    # With default value\n'$10.99'\n```\n\nThese methods work seamlessly with all selection types (CSS, XPath, `find`, etc.) and are the recommended way to extract text and attribute values in a Scrapy-compatible style.\n\nNow, let's see what [Selectors](#selectors) class adds to the table with that out of the way.\n### Properties\nApart from the standard operations on Python lists, such as iteration and slicing.\n\nYou can do the following:\n\nExecute CSS and XPath selectors directly on the [Selector](#selector) instances it has, while the return types are the same as [Selector](#selector)'s `css` and `xpath` methods. The arguments are similar, except the `adaptive` argument is not available here. This, of course, makes chaining methods very straightforward.\n```python\n>>> page.css('.product_pod a')\n[<data='<a href=\"catalogue/a-light-in-the-attic_...' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/a-light-in-the-attic_...' parent='<h3><a href=\"catalogue/a-light-in-the-at...'>,\n <data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<h3><a href=\"catalogue/tipping-the-velve...'>,\n <data='<a href=\"catalogue/soumission_998/index....' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/soumission_998/index....' parent='<h3><a href=\"catalogue/soumission_998/in...'>,\n...]\n\n>>> page.css('.product_pod').css('a')  # Returns the same result\n[<data='<a href=\"catalogue/a-light-in-the-attic_...' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/a-light-in-the-attic_...' parent='<h3><a href=\"catalogue/a-light-in-the-at...'>,\n <data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<h3><a href=\"catalogue/tipping-the-velve...'>,\n <data='<a href=\"catalogue/soumission_998/index....' parent='<div class=\"image_container\"> <a href=\"c...'>,\n <data='<a href=\"catalogue/soumission_998/index....' parent='<h3><a href=\"catalogue/soumission_998/in...'>,\n...]\n```\nRun the `re` and `re_first` methods directly. They take the same arguments passed to the [Selector](#selector) class. 
I will leave the explanation of these methods to the [TextHandler](#texthandler) section below.\n\nHowever, in this class, the `re_first` method behaves differently: it runs `re` on each [Selector](#selector) within and returns the first one with a result. The `re` method will return a [TextHandlers](#texthandlers) object as normal, which combines all the [TextHandler](#texthandler) instances into one [TextHandlers](#texthandlers) instance.\n```python\n>>> page.css('.price_color').re(r'[\\d\\.]+')\n['51.77',\n '53.74',\n '50.10',\n '47.82',\n '54.23',\n...]\n\n>>> page.css('.product_pod h3 a::attr(href)').re(r'catalogue/(.*)/index.html')\n['a-light-in-the-attic_1000',\n 'tipping-the-velvet_999',\n 'soumission_998',\n 'sharp-objects_997',\n...]\n```\nWith the `search` method, you can search quickly in the available [Selector](#selector) instances. The function you pass must accept a [Selector](#selector) instance as the first argument and return True/False. The method will return the first [Selector](#selector) instance that satisfies the function; otherwise, it will return `None`.\n```python\n# Find the first product with the price '54.23'.\n>>> search_function = lambda p: float(p.css('.price_color').re_first(r'[\\d\\.]+')) == 54.23\n>>> page.css('.product_pod').search(search_function)\n<data='<article class=\"product_pod\"><div class=...' parent='<li class=\"col-xs-6 col-sm-4 col-md-3 co...'>\n```\nYou can use the `filter` method, too, which takes a function like the `search` method but returns a `Selectors` instance of all the [Selector](#selector) instances that satisfy the function\n```python\n# Find all products with prices over $50\n>>> filtering_function = lambda p: float(p.css('.price_color').re_first(r'[\\d\\.]+')) > 50\n>>> page.css('.product_pod').filter(filtering_function)\n[<data='<article class=\"product_pod\"><div class=...' parent='<li class=\"col-xs-6 col-sm-4 col-md-3 co...'>,\n <data='<article class=\"product_pod\"><div class=...' parent='<li class=\"col-xs-6 col-sm-4 col-md-3 co...'>,\n <data='<article class=\"product_pod\"><div class=...' parent='<li class=\"col-xs-6 col-sm-4 col-md-3 co...'>,\n...]\n```\nYou can safely access the first or last element without worrying about index errors:\n```python\n>>> page.css('.product').first   # First Selector or None\n<data='<article class=\"product\" data-id=\"1\"><h3...'>\n>>> page.css('.product').last    # Last Selector or None\n<data='<article class=\"product\" data-id=\"3\"><h3...'>\n>>> page.css('.nonexistent').first  # Returns None instead of raising IndexError\n```\n\nIf you are too lazy like me and want to know the number of [Selector](#selector) instances in a [Selectors](#selectors) instance, you can do this:\n```python\npage.css('.product_pod').length\n```\nwhich is equivalent to\n```python\nlen(page.css('.product_pod'))\n```\nYup, like JavaScript :)\n\n## TextHandler\nThis class is mandatory to understand, as all methods/properties that should return a string for you will return `TextHandler`, and the ones that should return a list of strings will return [TextHandlers](#texthandlers) instead.\n\nTextHandler is a subclass of the standard Python string, so you can do anything with it that you can do with a Python string. So, what is the difference that requires a different naming?\n\nOf course, TextHandler provides extra methods and properties that standard Python strings don't have. 
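\n\nAs a quick taste before the details, here's an illustrative sketch (assuming the books.toscrape.com page from the examples above):\n```python\nprice = page.css('.price_color::text').get()  # a TextHandler, not a plain str\nprice.re_first(r'[\\d\\.]+')                    # regex directly on the text, e.g. '51.77'\nprice.clean().strip()                         # str-style methods return TextHandler too, so calls keep chaining\n```\n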
We will review them now, but remember that all methods and properties in all classes that return string(s) return TextHandler, which opens the door for creativity and makes the code shorter and cleaner, as you will see. Also, you can import it directly and use it on any string, which we will explain [later](../development/scrapling_custom_types.md).\n### Usage\nFirst, before discussing the added methods, you need to know that all operations on it, like slicing, accessing by index, etc., and methods like `split`, `replace`, `strip`, etc., all return a `TextHandler` again, so you can chain them as you want. If you find a method or property that returns a standard string instead of `TextHandler`, please open an issue, and we will override it as well.\n\nFirst, we start with the `re` and `re_first` methods. These are the same methods that exist in the other classes ([Selector](#selector), [Selectors](#selectors), and [TextHandlers](#texthandlers)), so they accept the same arguments.\n\n- The `re` method takes a string/compiled regex pattern as the first argument. It searches the data for all strings matching the regex and returns them as a [TextHandlers](#texthandlers) instance. The `re_first` method takes the same arguments and behaves similarly, but, as you probably figured out from the name, it returns only the first result as a `TextHandler` instance.\n    \n    Also, it takes other helpful arguments, which are:\n    \n    - **replace_entities**: This is enabled by default. It replaces character entity references with their corresponding characters.\n    - **clean_match**: It's disabled by default. This causes the method to ignore all whitespace, including consecutive spaces, while matching.\n    - **case_sensitive**: It's enabled by default. As the name implies, disabling it causes the regex to ignore letter case during compilation.\n  \n    You have seen these examples before; the return result is [TextHandlers](#texthandlers) because we used the `re` method.\n    ```python\n    >>> page.css('.price_color').re(r'[\\d\\.]+')\n    ['51.77',\n     '53.74',\n     '50.10',\n     '47.82',\n     '54.23',\n    ...]\n    \n    >>> page.css('.product_pod h3 a::attr(href)').re(r'catalogue/(.*)/index.html')\n    ['a-light-in-the-attic_1000',\n     'tipping-the-velvet_999',\n     'soumission_998',\n     'sharp-objects_997',\n    ...]\n    ```\n    To explain the other arguments better, we will use a custom string for each example below\n    ```python\n    >>> from scrapling import TextHandler\n    >>> test_string = TextHandler('hi  there')  # Hence the two spaces\n    >>> test_string.re('hi there')\n    >>> test_string.re('hi there', clean_match=True)  # Using `clean_match` will clean the string before matching the regex\n    ['hi there']\n    \n    >>> test_string2 = TextHandler('Oh, Hi Mark')\n    >>> test_string2.re_first('oh, hi Mark')\n    >>> test_string2.re_first('oh, hi Mark', case_sensitive=False)  # Hence disabling `case_sensitive`\n    'Oh, Hi Mark'\n    \n    # Mixing arguments\n    >>> test_string.re('hi there', clean_match=True, case_sensitive=False)\n    ['hi There']\n    ```\n    Another use of the idea of replacing strings with `TextHandler` everywhere is that a property like `html_content` returns `TextHandler`, so you can do regex on the HTML content if you want:\n    ```python\n    >>> page.html_content.re('div class=\".*\">(.*)</div')\n    ['In stock: 5', 'In stock: 3', 'Out of stock']\n    ```\n\n- You also have the `.json()` method, which tries to convert the content to a JSON 
object quickly if possible; otherwise, it throws an error\n  ```python\n  >>> page.css('#page-data::text').get()\n    '\\n      {\\n        \"lastUpdated\": \"2024-09-22T10:30:00Z\",\\n        \"totalProducts\": 3\\n      }\\n    '\n  >>> page.css('#page-data::text').get().json()\n    {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}\n  ```\n  Hence, if you didn't specify a text node while selecting an element (like the text content or an attribute text content), the text content will be selected automatically, like this\n  ```python\n  >>> page.css('#page-data')[0].json()\n  {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}\n  ```\n  The [Selector](#selector) class adds one thing here, too; let's say this is the page we are working with:\n  ```html\n  <html>\n      <body>\n          <div>\n            <script id=\"page-data\" type=\"application/json\">\n              {\n                \"lastUpdated\": \"2024-09-22T10:30:00Z\",\n                \"totalProducts\": 3\n              }\n            </script>\n          </div>\n      </body>\n  </html>\n  ```\n  The [Selector](#selector) class has the `get_all_text` method, which you should be aware of by now. This method returns a `TextHandler`, of course.<br/><br/>\n  So, as you know here, if you did something like this\n  ```python\n  >>> page.css('div::text').get().json()\n  ```\n  You will get an error because the `div` tag doesn't have any direct text content that can be serialized to JSON; it doesn't have any direct text content at all.<br/><br/>\n  In this case, the `get_all_text` method comes to the rescue, so you can do something like that\n  ```python\n  >>> page.css('div')[0].get_all_text(ignore_tags=[]).json()\n    {'lastUpdated': '2024-09-22T10:30:00Z', 'totalProducts': 3}\n  ```\n  I used the `ignore_tags` argument here because the default value of it is `('script', 'style',)`, as you are aware.<br/><br/>\n  Another related behavior to be aware of occurs when using any fetcher, which we will explain later. If you have a JSON response like this example:\n  ```python\n  >>> page = Selector(\"\"\"{\"some_key\": \"some_value\"}\"\"\")\n  ```\n  Because the [Selector](#selector) class is optimized to deal with HTML pages, it will deal with it as a broken HTML response and fix it, so if you used the `html_content` property, you get this\n  ```python\n  >>> page.html_content\n  '<html><body><p>{\"some_key\": \"some_value\"}</p></body></html>'\n  ```\n  Here, you can use the `json` method directly, and it will work\n  ```python\n  >>> page.json()\n  {'some_key': 'some_value'}\n  ```\n  You might wonder how this happened, given that the `html` tag doesn't contain direct text.<br/>\n  Well, for cases like JSON responses, I made the [Selector](#selector) class keep a raw copy of the content it receives. This way, when you use the `.json()` method, it checks for that raw copy and then converts it to JSON. 
If the raw copy is unavailable, as with the elements, it checks the current element's text content; otherwise, it uses the `get_all_text` method directly.<br/>\n\n- Another handy method is `.clean()`, which removes whitespace characters like newlines and collapses consecutive spaces for you, returning a new `TextHandler` instance\n```python\n>>> TextHandler('\\n wonderful  idea, \\reh?').clean()\n'wonderful idea, eh?'\n```\nAlso, you can pass the `remove_entities` argument to make `clean` replace HTML entities with their corresponding characters.\n\n- Another method that might be helpful in some cases is the `.sort()` method to sort the string's characters for you, as you do with lists\n```python\n>>> TextHandler('acb').sort()\n'abc'\n```\nOr do it in reverse:\n```python\n>>> TextHandler('acb').sort(reverse=True)\n'cba'\n```\n\nOther methods and properties will be added over time, but remember that this class is returned in place of strings nearly everywhere in the library.\n\n## TextHandlers\nYou probably guessed it: This class is similar to [Selectors](#selectors) and [Selector](#selector), but here it inherits the same logic and methods as standard lists, with only `re` and `re_first` as new methods.\n\nThe only difference is that the `re_first` method logic here runs `re` on each [TextHandler](#texthandler) and returns the first result, or `None`. Nothing new needs to be explained here, but new methods will be added over time.\n\n## AttributesHandler\nThis is a read-only version of Python's standard dictionary, or `dict`, used solely to store the attributes of each element/[Selector](#selector) instance.\n```python\n>>> print(page.find('script').attrib)\n{'id': 'page-data', 'type': 'application/json'}\n>>> type(page.find('script').attrib).__name__\n'AttributesHandler'\n```\nBecause it's read-only, it will use fewer resources than the standard dictionary. Still, it has the same dictionary methods and properties, except those that allow you to modify/override the data.\n\nIt currently adds two extra simple methods:\n\n- The `search_values` method\n\n    In standard dictionaries, you can do `dict.get(\"key_name\")` to check if a key exists. However, if you want to search by values rather than keys, you will need a few additional lines of code. This method does that for you. It allows you to search the current attributes by values and returns a dictionary of each matching item.\n    \n    A simple example would be\n    ```python\n    >>> for i in page.find('script').attrib.search_values('page-data'):\n            print(i)\n    {'id': 'page-data'}\n    ```\n    But this method provides the `partial` argument as well, which allows you to search by part of the value:\n    ```python\n    >>> for i in page.find('script').attrib.search_values('page', partial=True):\n            print(i)\n    {'id': 'page-data'}\n    ```\n    These examples are a bit artificial; a more real-world example would be using it with the `find_all` method to find all elements that have a specific value in their attributes:\n    ```python\n    >>> page.find_all(lambda element: list(element.attrib.search_values('product')))\n    [<data='<article class=\"product\" data-id=\"1\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n     <data='<article class=\"product\" data-id=\"2\"><h3...' parent='<div class=\"product-list\"> <article clas...'>,\n     <data='<article class=\"product\" data-id=\"3\"><h3...' parent='<div class=\"product-list\"> <article clas...'>]\n    ```\n    All these elements have 'product' as the value for the `class` attribute.\n    \n    Note that I used the `list` function here because `search_values` returns a generator, which would otherwise be truthy for all elements.\n\n- The `json_string` property\n\n    This property converts the current attributes to a JSON string if the attributes are JSON serializable; otherwise, it throws an error.\n  \n    ```python\n    >>> page.find('script').attrib.json_string\n    b'{\"id\":\"page-data\",\"type\":\"application/json\"}'\n    ```"
  },
  {
    "path": "docs/parsing/selection.md",
    "content": "# Querying elements\nScrapling currently supports parsing HTML pages exclusively, so it doesn't support XML feeds. This decision was made because the adaptive feature won't work with XML, but that might change soon, so stay tuned :)\n\nIn Scrapling, there are five main ways to find elements:\n\n1. CSS3 Selectors\n2. XPath Selectors\n3. Finding elements based on filters/conditions.\n4. Finding elements whose content contains a specific text\n5. Finding elements whose content matches a specific regex\n\nOf course, there are other indirect ways to find elements with Scrapling, but here we will discuss the main ways in detail. We will also bring up one of the most remarkable features of Scrapling: the ability to find elements that are similar to the element you have; you can jump to that section directly from [here](#finding-similar-elements).\n\nIf you are new to Web Scraping, have little to no experience writing selectors, and want to start quickly, I recommend you jump directly to learning the `find`/`find_all` methods from [here](#filters-based-searching).\n\n## CSS/XPath selectors\n\n### What are CSS selectors?\n[CSS](https://en.wikipedia.org/wiki/CSS) is a language for applying styles to HTML documents. It defines selectors to associate those styles with specific HTML elements.\n\nScrapling implements CSS3 selectors as described in the [W3C specification](http://www.w3.org/TR/2011/REC-css3-selectors-20110929/). CSS selectors support comes from `cssselect`, so it's better to read about which [selectors are supported from cssselect](https://cssselect.readthedocs.io/en/latest/#supported-selectors) and pseudo-functions/elements.\n\nAlso, Scrapling implements some non-standard pseudo-elements like:\n\n* To select text nodes, use ``::text``.\n* To select attribute values, use ``::attr(name)`` where name is the name of the attribute that you want the value of\n\nIn short, if you come from Scrapy/Parsel, you will find the same logic for selectors here to make it easier. No need to implement a stranger logic to the one that most of us are used to :)\n\nTo select elements with CSS selectors, use the `css` method, which returns `Selectors`. Use `[0]` to get the first element, or `.get()` / `.getall()` to extract text values from text/attribute pseudo-selectors.\n\n### What are XPath selectors?\n[XPath](https://en.wikipedia.org/wiki/XPath) is a language for selecting nodes in XML documents, which can also be used with HTML. This [cheatsheet](https://devhints.io/xpath) is a good resource for learning about [XPath](https://en.wikipedia.org/wiki/XPath). Scrapling adds XPath selectors directly through [lxml](https://lxml.de/).\n\nIn short, it is the same situation as CSS Selectors; if you come from Scrapy/Parsel, you will find the same logic for selectors here. However, Scrapling doesn't implement the XPath extension function `has-class` as Scrapy/Parsel does. Instead, it provides the `has_class` method, which can be used on elements returned for the same purpose.\n\nTo select elements with XPath selectors, you have the `xpath` method. Again, this method follows the same logic as the CSS selectors method above.\n\n> Note that each method of `css` and `xpath` has additional arguments, but we didn't explain them here, as they are all about the adaptive feature. 
The adaptive feature will have its own page later to be described in detail.\n\n### Selectors examples\nLet's see some shared examples of using CSS and XPath Selectors.\n\nSelect all elements with the class `product`.\n```python\nproducts = page.css('.product')\nproducts = page.xpath('//*[@class=\"product\"]')\n```\n!!! info \"Note:\"\n\n    The XPath one won't be accurate if there's another class; **it's always better to rely on CSS for selecting by class**\n\nSelect the first element with the class `product`.\n```python\nproduct = page.css('.product')[0]\nproduct = page.xpath('//*[@class=\"product\"]')[0]\n```\nGet the text of the first element with the `h1` tag name\n```python\ntitle = page.css('h1::text').get()\ntitle = page.xpath('//h1//text()').get()\n```\nWhich is the same as doing\n```python\ntitle = page.css('h1')[0].text\ntitle = page.xpath('//h1')[0].text\n```\nGet the `href` attribute of the first element with the `a` tag name\n```python\nlink = page.css('a::attr(href)').get()\nlink = page.xpath('//a/@href').get()\n```\nSelect the text of the first element with the `h1` tag name, which contains `Phone`, and under an element with class `product`.\n```python\ntitle = page.css('.product h1:contains(\"Phone\")::text').get()\ntitle = page.xpath('//*[@class=\"product\"]//h1[contains(text(),\"Phone\")]/text()').get()\n```\nYou can nest and chain selectors as you want, given that they return results\n```python\npage.css('.product')[0].css('h1:contains(\"Phone\")::text').get()\npage.xpath('//*[@class=\"product\"]')[0].xpath('//h1[contains(text(),\"Phone\")]/text()').get()\npage.xpath('//*[@class=\"product\"]')[0].css('h1:contains(\"Phone\")::text').get()\n```\nAnother example\n\nAll links that have 'image' in their 'href' attribute\n```python\nlinks = page.css('a[href*=\"image\"]')\nlinks = page.xpath('//a[contains(@href, \"image\")]')\nfor index, link in enumerate(links):\n    link_value = link.attrib['href']  # Cleaner than link.css('::attr(href)').get()\n    link_text = link.text\n    print(f'Link number {index} points to this url {link_value} with text content as \"{link_text}\"')\n```\n\n## Text-content selection\nScrapling provides the ability to select elements based on their direct text content, and you have two ways to do this:\n\n1. Elements whose direct text content contains the given text with many options through the `find_by_text` method.\n2. Elements whose direct text content matches the given regex pattern with many options through the `find_by_regex` method.\n\nWhat you can do with `find_by_text` can be done with `find_by_regex` if you are good enough with regular expressions (regex), but we are providing more options to make them easier for all users to access.\n\nWith `find_by_text`, you pass the text as the first argument; with `find_by_regex`, the regex pattern is the first argument. Both methods share the following arguments:\n\n* **first_match**: If `True` (the default), the method used will return the first result it finds.\n* **case_sensitive**: If `True`, the case of the letters will be considered.\n* **clean_match**: If `True`, all whitespaces and consecutive spaces will be replaced with a single space before matching.\n\nBy default, Scrapling searches for the exact matching of the text/pattern you pass to `find_by_text`, so the text content of the wanted element has to be ONLY the text you input, but that's why it also has one extra argument, which is:\n\n* **partial**: If enabled, `find_by_text` will return elements that contain the input text. 
So it's not an exact match anymore\n\n!!! abstract \"Note:\"\n\n    The method `find_by_regex` can accept both regular strings and a compiled regex pattern as its first argument, as you will see in the upcoming examples.\n\n### Finding Similar Elements\nOne of the most remarkable new features Scrapling puts on the table is the ability to tell Scrapling to find elements similar to the element at hand. This feature's inspiration came from the AutoScraper library, but in Scrapling, it can be used on elements found by any method. Most of its usage would likely occur after finding elements through text content, similar to how AutoScraper works, making it convenient to explain here.\n\nSo, how does it work?\n\nImagine a scenario where you found a product by its title, for example, and you want to extract other products listed in the same table/container. With the element you have, you can call the method `.find_similar()` on it, and Scrapling will:\n\n1. Find all page elements with the same DOM tree depth as this element. \n2. All found elements will be checked, and those without the same tag name, parent tag name, and grandparent tag name will be dropped.\n3. Now we are sure (like 99% sure) that these elements are the ones we want, but as a last check, Scrapling will use fuzzy matching to drop the elements whose attributes don't look like the attributes of our element. There's a percentage to control this step, and I recommend you not play with it unless the default settings don't get the elements you want.\n\nThat's a lot of talking, I know, but I had to go deep. I will give examples of using this method in the next section, but first, these are the arguments that can be passed to this method:\n\n* **similarity_threshold**: This is the percentage we discussed in step 3 for comparing elements' attributes. The default value is 0.2. In Simpler words, the tag attributes of both elements should be at least 20% similar. If you want to turn off this check (basically Step 3), you can set this attribute to 0, but I recommend you read what the other arguments do first.\n* **ignore_attributes**: The attribute names passed will be ignored while matching the attributes in the last step. The default value is `('href', 'src',)` because URLs can change significantly across elements, making them unreliable.\n* **match_text**: If `True`, the element's text content will be considered when matching (Step 3). Using this argument in typical cases is not recommended, but it depends.\n\nNow, let's check out the examples below.\n\n### Examples\nLet's see some shared examples of finding elements with raw text and regex.\n\nI will use the `Fetcher` class with these examples, but it will be explained in detail later.\n```python\nfrom scrapling.fetchers import Fetcher\npage = Fetcher.get('https://books.toscrape.com/index.html')\n```\nFind the first element whose text fully matches this text\n```python\n>>> page.find_by_text('Tipping the Velvet')\n<data='<a href=\"catalogue/tipping-the-velvet_99...' 
parent='<h3><a href=\"catalogue/tipping-the-velve...'>\n```\nCombining it with `page.urljoin` to return the full URL from the relative `href`.\n```python\n>>> page.find_by_text('Tipping the Velvet').attrib['href']\n'catalogue/tipping-the-velvet_999/index.html'\n>>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href'])\n'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'\n```\nGet all matches if there are more (notice it returns a list)\n```python\n>>> page.find_by_text('Tipping the Velvet', first_match=False)\n[<data='<a href=\"catalogue/tipping-the-velvet_99...' parent='<h3><a href=\"catalogue/tipping-the-velve...'>]\n```\nGet all elements that contain the word `the` (Partial matching)\n```python\n>>> results = page.find_by_text('the', partial=True, first_match=False)\n>>> [i.text for i in results]\n['A Light in the ...',\n 'Tipping the Velvet',\n 'The Requiem Red',\n 'The Dirty Little Secrets ...',\n 'The Coming Woman: A ...',\n 'The Boys in the ...',\n 'The Black Maria',\n 'Mesaerion: The Best Science ...',\n \"It's Only the Himalayas\"]\n```\nThe search is case-insensitive, so those results include `The`, not just the lowercase `the`; let's limit the search to elements with `the` only.\n```python\n>>> results = page.find_by_text('the', partial=True, first_match=False, case_sensitive=True)\n>>> [i.text for i in results]\n['A Light in the ...',\n 'Tipping the Velvet',\n 'The Boys in the ...',\n \"It's Only the Himalayas\"]\n```\nGet the first element whose text content matches my price regex\n```python\n>>> page.find_by_regex(r'£[\\d\\.]+')\n<data='<p class=\"price_color\">£51.77</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>\n>>> page.find_by_regex(r'£[\\d\\.]+').text\n'£51.77'\n```\nIt's the same if you pass the compiled regex as well; Scrapling will detect the input type and act upon that:\n```python\n>>> import re\n>>> regex = re.compile(r'£[\\d\\.]+')\n>>> page.find_by_regex(regex)\n<data='<p class=\"price_color\">£51.77</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>\n>>> page.find_by_regex(regex).text\n'£51.77'\n```\nGet all elements that match the regex\n```python\n>>> page.find_by_regex(r'£[\\d\\.]+', first_match=False)\n[<data='<p class=\"price_color\">£51.77</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>,\n <data='<p class=\"price_color\">£53.74</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>,\n <data='<p class=\"price_color\">£50.10</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>,\n <data='<p class=\"price_color\">£47.82</p>' parent='<div class=\"product_price\"> <p class=\"pr...'>,\n ...]\n```\nAnd so on...\n\nFind all elements similar to the current element in location and attributes. For our case, ignore the 'title' attribute while matching\n```python\n>>> element = page.find_by_text('Tipping the Velvet')\n>>> element.find_similar(ignore_attributes=['title'])\n[<data='<a href=\"catalogue/a-light-in-the-attic_...' parent='<h3><a href=\"catalogue/a-light-in-the-at...'>,\n <data='<a href=\"catalogue/soumission_998/index....' parent='<h3><a href=\"catalogue/soumission_998/in...'>,\n <data='<a href=\"catalogue/sharp-objects_997/ind...' 
parent='<h3><a href=\"catalogue/sharp-objects_997...'>,\n...]\n```\nNotice that the number of elements is 19, not 20, because the current element is not included in the results.\n```python\n>>> len(element.find_similar(ignore_attributes=['title']))\n19\n```\nGet the `href` attribute from all similar elements\n```python\n>>> [\n    element.attrib['href']\n    for element in element.find_similar(ignore_attributes=['title'])\n]\n['catalogue/a-light-in-the-attic_1000/index.html',\n 'catalogue/soumission_998/index.html',\n 'catalogue/sharp-objects_997/index.html',\n ...]\n```\nTo increase the complexity a little bit, let's say we want to get all the books' data using that element as a starting point for some reason\n```python\n>>> for product in element.parent.parent.find_similar():\n        print({\n            \"name\": product.css('h3 a::text').get(),\n            \"price\": product.css('.price_color')[0].re_first(r'[\\d\\.]+'),\n            \"stock\": product.css('.availability::text').getall()[-1].clean()\n        })\n{'name': 'A Light in the ...', 'price': '51.77', 'stock': 'In stock'}\n{'name': 'Soumission', 'price': '50.10', 'stock': 'In stock'}\n{'name': 'Sharp Objects', 'price': '47.82', 'stock': 'In stock'}\n...\n```\n### Advanced examples \nSee more advanced or real-world examples using the `find_similar` method.\n\nE-commerce Product Extraction\n```python\ndef extract_product_grid(page):\n    # Find the first product card\n    first_product = page.find_by_text('Add to Cart').find_ancestor(\n        lambda e: e.has_class('product-card')\n    )\n\n    # Find similar product cards\n    products = first_product.find_similar()\n\n    return [\n        {\n            'name': p.css('h3::text').get(),\n            'price': p.css('.price::text').re_first(r'\\d+\\.\\d{2}'),\n            'stock': 'In stock' in p.text,\n            'rating': p.css('.rating')[0].attrib.get('data-rating')\n        }\n        for p in products\n    ]\n```\nTable Row Extraction\n```python\ndef extract_table_data(page):\n    # Find the first data row\n    first_row = page.css('table tbody tr')[0]\n\n    # Find similar rows\n    rows = first_row.find_similar()\n\n    return [\n        {\n            'column1': row.css('td:nth-child(1)::text').get(),\n            'column2': row.css('td:nth-child(2)::text').get(),\n            'column3': row.css('td:nth-child(3)::text').get()\n        }\n        for row in rows\n    ]\n```\nForm Field Extraction\n```python\ndef extract_form_fields(page):\n    # Find first form field container\n    first_field = page.css('input')[0].find_ancestor(\n        lambda e: e.has_class('form-field')\n    )\n\n    # Find similar field containers\n    fields = first_field.find_similar()\n\n    return [\n        {\n            'label': f.css('label::text').get(),\n            'type': f.css('input')[0].attrib.get('type'),\n            'required': 'required' in f.css('input')[0].attrib\n        }\n        for f in fields\n    ]\n```\nExtracting reviews from a website\n```python\ndef extract_reviews(page):\n    # Find first review\n    first_review = page.find_by_text('Great product!')\n    review_container = first_review.find_ancestor(\n        lambda e: e.has_class('review')\n    )\n    \n    # Find similar reviews\n    all_reviews = review_container.find_similar()\n    \n    return [\n        {\n            'text': r.css('.review-text::text').get(),\n            'rating': r.attrib.get('data-rating'),\n            'author': r.css('.reviewer::text').get()\n        }\n        for r in all_reviews\n  
  ]\n```\n## Filters-based searching\nThis search method is arguably the best way to find elements in Scrapling, as it is powerful and easier for newcomers to Web Scraping to learn than writing selectors. \n\nInspired by BeautifulSoup's `find_all` function, you can find elements using the `find_all` and `find` methods. Both methods can accept multiple filters and return all elements on the pages where all these filters apply.\n\nTo be more specific:\n\n* Any string passed is considered a tag name.\n* Any iterable passed, like List/Tuple/Set, will be considered as an iterable of tag names.\n* Any dictionary is considered a mapping of HTML element(s), attribute names, and attribute values.\n* Any regex patterns passed are used to filter elements by content, like the `find_by_regex` method\n* Any functions passed are used to filter elements\n* Any keyword argument passed is considered as an HTML element attribute with its value.\n\nIt collects all passed arguments and keywords, and each filter passes its results to the following filter in a waterfall-like filtering system.\n\nIt filters all elements in the current page/element in the following order:\n\n1. All elements with the passed tag name(s) get collected.\n2. All elements that match all passed attribute(s) are collected; if a previous filter is used, then previously collected elements are filtered.\n3. All elements that match all passed regex patterns are collected, or if previous filter(s) are used, then previously collected elements are filtered.\n4. All elements that fulfill all passed function(s) are collected; if a previous filter(s) is used, then previously collected elements are filtered.\n\n!!! note \"Notes:\"\n\n    1. As you probably understood, the filtering process always starts from the first filter it finds in the filtering order above. So, if no tag name(s) are passed but attributes are passed, the process starts from that step (number 2), and so on.\n    2. The order in which you pass the arguments doesn't matter. The only order considered is the one explained above.\n\nCheck examples to clear any confusion :)\n\n### Examples\n```python\n>>> from scrapling.fetchers import Fetcher\n>>> page = Fetcher.get('https://quotes.toscrape.com/')\n```\nFind all elements with the tag name `div`.\n```python\n>>> page.find_all('div')\n[<data='<div class=\"container\"> <div class=\"row...' parent='<body> <div class=\"container\"> <div clas...'>,\n <data='<div class=\"row header-box\"> <div class=...' parent='<div class=\"container\"> <div class=\"row...'>,\n...]\n```\nFind all div elements with a class that equals `quote`.\n```python\n>>> page.find_all('div', class_='quote')\n[<data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n <data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n...]\n```\nSame as above.\n```python\n>>> page.find_all('div', {'class': 'quote'})\n[<data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n <data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n...]\n```\nFind all elements with a class that equals `quote`.\n```python\n>>> page.find_all({'class': 'quote'})\n[<data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n <data='<div class=\"quote\" itemscope itemtype=\"h...' 
parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n...]\n```\nFind all div elements with a class that equals `quote` and contains the element `.text`, which contains the word 'world' in its content.\n```python\n>>> page.find_all('div', {'class': 'quote'}, lambda e: \"world\" in e.css('.text::text').get())\n[<data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>]\n```\nFind all elements that have children.\n```python\n>>> page.find_all(lambda element: len(element.children) > 0)\n[<data='<html lang=\"en\"><head><meta charset=\"UTF...'>,\n <data='<head><meta charset=\"UTF-8\"><title>Quote...' parent='<html lang=\"en\"><head><meta charset=\"UTF...'>,\n <data='<body> <div class=\"container\"> <div clas...' parent='<html lang=\"en\"><head><meta charset=\"UTF...'>,\n...]\n```\nFind all elements that contain the word 'world' in their content.\n```python\n>>> page.find_all(lambda element: \"world\" in element.text)\n[<data='<span class=\"text\" itemprop=\"text\">“The...' parent='<div class=\"quote\" itemscope itemtype=\"h...'>,\n <data='<a class=\"tag\" href=\"/tag/world/page/1/\"...' parent='<div class=\"tags\"> Tags: <meta class=\"ke...'>]\n```\nFind all span elements that match the given regex\n```python\n>>> page.find_all('span', re.compile(r'world'))\n[<data='<span class=\"text\" itemprop=\"text\">“The...' parent='<div class=\"quote\" itemscope itemtype=\"h...'>]\n```\nFind all div and span elements with class 'quote' (No span elements like that, so only div returned)\n```python\n>>> page.find_all(['div', 'span'], {'class': 'quote'})\n[<data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n <data='<div class=\"quote\" itemscope itemtype=\"h...' parent='<div class=\"col-md-8\"> <div class=\"quote...'>,\n...]\n```\nMix things up\n```python\n>>> page.find_all({'itemtype':\"http://schema.org/CreativeWork\"}, 'div').css('.author::text').getall()\n['Albert Einstein',\n 'J.K. Rowling',\n...]\n```\nA bonus pro tip: Find all elements whose `href` attribute's value ends with the word 'Einstein'.\n```python\n>>> page.find_all({'href$': 'Einstein'})\n[<data='<a href=\"/author/Albert-Einstein\">(about...' parent='<span>by <small class=\"author\" itemprop=...'>,\n <data='<a href=\"/author/Albert-Einstein\">(about...' parent='<span>by <small class=\"author\" itemprop=...'>,\n <data='<a href=\"/author/Albert-Einstein\">(about...' parent='<span>by <small class=\"author\" itemprop=...'>]\n```\nAnother pro tip: Find all elements whose `href` attribute's value has '/author/' in it\n```python\n>>> page.find_all({'href*': '/author/'})\n[<data='<a href=\"/author/Albert-Einstein\">(about...' parent='<span>by <small class=\"author\" itemprop=...'>,\n <data='<a href=\"/author/J-K-Rowling\">(about)</a...' parent='<span>by <small class=\"author\" itemprop=...'>,\n <data='<a href=\"/author/Albert-Einstein\">(about...' 
parent='<span>by <small class=\"author\" itemprop=...'>,\n...]\n```\nAnd so on...\n\n## Generating selectors\nYou can always generate CSS/XPath selectors for any element that can be reused here or anywhere else, and the most remarkable thing is that it doesn't matter what method you used to find that element!\n\nGenerate a short CSS selector for the `url_element` element (if possible, create a short one; otherwise, it's a full selector)\n```python\n>>> url_element = page.find({'href*': '/author/'})\n>>> url_element.generate_css_selector\n'body > div > div:nth-of-type(2) > div > div > span:nth-of-type(2) > a'\n```\nGenerate a full CSS selector for the `url_element` element from the start of the page\n```python\n>>> url_element.generate_full_css_selector\n'body > div > div:nth-of-type(2) > div > div > span:nth-of-type(2) > a'\n```\nGenerate a short XPath selector for the `url_element` element (if possible, create a short one; otherwise, it's a full selector)\n```python\n>>> url_element.generate_xpath_selector\n'//body/div/div[2]/div/div/span[2]/a'\n```\nGenerate a full XPath selector for the `url_element` element from the start of the page\n```python\n>>> url_element.generate_full_xpath_selector\n'//body/div/div[2]/div/div/span[2]/a'\n```\n!!! abstract \"Note:\"\n\n    When you tell Scrapling to create a short selector, it tries to find a unique element to use in generation as a stop point, like an element with an `id` attribute, but in our case, there wasn't any, so that's why the short and the full selector will be the same.\n\n## Using selectors with regular expressions\nSimilar to `parsel`/`scrapy`, `re` and `re_first` methods are available for extracting data using regular expressions. However, unlike the former libraries, these methods are in nearly all classes like `Selector`/`Selectors`/`TextHandler` and `TextHandlers`, which means you can use them directly on the element even if you didn't select a text node. \n\nWe will have a deep look at it while explaining the [TextHandler](main_classes.md#texthandler) class, but in general, it works like the examples below:\n```python\n>>> page.css('.price_color')[0].re_first(r'[\\d\\.]+')\n'51.77'\n\n>>> page.css('.price_color').re_first(r'[\\d\\.]+')\n'51.77'\n\n>>> page.css('.price_color').re(r'[\\d\\.]+')\n['51.77',\n '53.74',\n '50.10',\n '47.82',\n '54.23',\n...]\n\n>>> page.css('.product_pod h3 a::attr(href)').re(r'catalogue/(.*)/index.html')\n['a-light-in-the-attic_1000',\n 'tipping-the-velvet_999',\n 'soumission_998',\n 'sharp-objects_997',\n...]\n\n>>> filtering_function = lambda e: e.parent.tag == 'h3' and e.parent.parent.has_class('product_pod')  # As above selector\n>>> page.find('a', filtering_function).attrib['href'].re(r'catalogue/(.*)/index.html')\n['a-light-in-the-attic_1000']\n\n>>> page.find_by_text('Tipping the Velvet').attrib['href'].re(r'catalogue/(.*)/index.html')\n['tipping-the-velvet_999']\n```\nAnd so on. You get the idea. We will explain this in more detail on the next page, along with the [TextHandler](main_classes.md#texthandler) class."
  },
  {
    "path": "docs/requirements.txt",
    "content": "zensical>=0.0.27\nmkdocstrings>=1.0.3\nmkdocstrings-python>=2.0.3\ngriffe-inherited-docstrings>=1.1.3\ngriffe-runtime-objects>=0.3.1\ngriffe-sphinx>=0.2.1\nblack>=26.1.0\npngquant"
  },
  {
    "path": "docs/spiders/advanced.md",
    "content": "# Advanced usages\n\n## Introduction\n\n!!! success \"Prerequisites\"\n\n    1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.\n\nThis page covers the spider system's advanced features: concurrency control, pause/resume, streaming, lifecycle hooks, statistics, and logging.\n\n## Concurrency Control\n\nThe spider system uses three class attributes to control how aggressively it crawls:\n\n| Attribute                        | Default | Description                                                      |\n|----------------------------------|---------|------------------------------------------------------------------|\n| `concurrent_requests`            | `4`     | Maximum number of requests being processed at the same time      |\n| `concurrent_requests_per_domain` | `0`     | Maximum concurrent requests per domain (0 = no per-domain limit) |\n| `download_delay`                 | `0.0`   | Seconds to wait before each request                              |\n\n```python\nclass PoliteSpider(Spider):\n    name = \"polite\"\n    start_urls = [\"https://example.com\"]\n\n    # Be gentle with the server\n    concurrent_requests = 4\n    concurrent_requests_per_domain = 2\n    download_delay = 1.0  # Wait 1 second between requests\n\n    async def parse(self, response: Response):\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nWhen `concurrent_requests_per_domain` is set, each domain gets its own concurrency limiter in addition to the global limit. This is useful when crawling multiple domains simultaneously — you can allow high global concurrency while being polite to each individual domain.\n\n!!! tip\n\n    The `download_delay` parameter adds a fixed wait before every request, regardless of the domain. Use it for simple rate limiting.\n\n### Using uvloop\n\nThe `start()` method accepts a `use_uvloop` parameter to use the faster [uvloop](https://github.com/MagicStack/uvloop)/[winloop](https://github.com/nicktimko/winloop) event loop implementation, if available:\n\n```python\nresult = MySpider().start(use_uvloop=True)\n```\n\nThis can improve throughput for I/O-heavy crawls. You'll need to install `uvloop` (Linux/macOS) or `winloop` (Windows) separately.\n\n## Pause & Resume\n\nThe spider supports graceful pause-and-resume via checkpointing. To enable it, pass a `crawldir` directory to the spider constructor:\n\n```python\nspider = MySpider(crawldir=\"crawl_data/my_spider\")\nresult = spider.start()\n\nif result.paused:\n    print(\"Crawl was paused. Run again to resume.\")\nelse:\n    print(\"Crawl completed!\")\n```\n\n### How It Works\n\n1. **Pausing**: Press `Ctrl+C` during a crawl. The spider waits for all in-flight requests to finish, saves a checkpoint (pending requests + a set of seen request fingerprints), and then exits.\n2. **Force stopping**: Press `Ctrl+C` a second time to stop immediately without waiting for active tasks.\n3. **Resuming**: Run the spider again with the same `crawldir`. It detects the checkpoint, restores the queue and seen set, and continues from where it left off — skipping `start_requests()`.\n4. 
**Cleanup**: When a crawl completes normally (not paused), the checkpoint files are deleted automatically.\n\n**Checkpoints are also saved periodically during the crawl (every 5 minutes by default).** \n\nYou can change the interval as follows:\n\n```python\n# Save checkpoint every 2 minutes\nspider = MySpider(crawldir=\"crawl_data/my_spider\", interval=120.0)\n```\n\nThe writing to the disk is atomic, so it's totally safe.\n\n!!! tip\n\n    Pressing `Ctrl+C` during a crawl always causes the spider to close gracefully, even if the checkpoint system is not enabled. Doing it again without waiting forces the spider to close immediately.\n\n### Knowing If You're Resuming\n\nThe `on_start()` hook receives a `resuming` flag:\n\n```python\nasync def on_start(self, resuming: bool = False):\n    if resuming:\n        self.logger.info(\"Resuming from checkpoint!\")\n    else:\n        self.logger.info(\"Starting fresh crawl\")\n```\n\n## Streaming\n\nFor long-running spiders or applications that need real-time access to scraped items, use the `stream()` method instead of `start()`:\n\n```python\nimport anyio\n\nasync def main():\n    spider = MySpider()\n    async for item in spider.stream():\n        print(f\"Got item: {item}\")\n        # Access real-time stats\n        print(f\"Items so far: {spider.stats.items_scraped}\")\n        print(f\"Requests made: {spider.stats.requests_count}\")\n\nanyio.run(main)\n```\n\nKey differences from `start()`:\n\n- `stream()` must be called from an async context\n- Items are yielded one by one as they're scraped, not collected into a list\n- You can access `spider.stats` during iteration for real-time statistics\n\n!!! abstract \n\n    The full list of all stats that can be accessed by `spider.stats` is explained below [here](#results--statistics)\n\nYou can use it with the checkpoint system too, so it's easy to build UI on top of spiders. UIs that have real-time data and can be paused/resumed.\n\n```python\nimport anyio\n\nasync def main():\n    spider = MySpider(crawldir=\"crawl_data/my_spider\")\n    async for item in spider.stream():\n        print(f\"Got item: {item}\")\n        # Access real-time stats\n        print(f\"Items so far: {spider.stats.items_scraped}\")\n        print(f\"Requests made: {spider.stats.requests_count}\")\n\nanyio.run(main)\n```\nYou can also use `spider.pause()` to shut down the spider in the code above. If you used it without enabling the checkpoint system, it will just close the crawl.\n\n## Lifecycle Hooks\n\nThe spider provides several hooks you can override to add custom behavior at different stages of the crawl:\n\n### on_start\n\nCalled before crawling begins. Use it for setup tasks like loading data or initializing resources:\n\n```python\nasync def on_start(self, resuming: bool = False):\n    self.logger.info(\"Spider starting up\")\n    # Load seed URLs from a database, initialize counters, etc.\n```\n\n### on_close\n\nCalled after crawling finishes (whether completed or paused). Use it for cleanup:\n\n```python\nasync def on_close(self):\n    self.logger.info(\"Spider shutting down\")\n    # Close database connections, flush buffers, etc.\n```\n\n### on_error\n\nCalled when a request fails with an exception. 
Use it for error tracking or custom recovery logic:\n\n```python\nasync def on_error(self, request: Request, error: Exception):\n    self.logger.error(f\"Failed: {request.url} - {error}\")\n    # Log to error tracker, save failed URL for later, etc.\n```\n\n### on_scraped_item\n\nCalled for every scraped item before it's added to the results. Return the item (modified or not) to keep it, or return `None` to drop it:\n\n```python\nasync def on_scraped_item(self, item: dict) -> dict | None:\n    # Drop items without a title\n    if not item.get(\"title\"):\n        return None\n\n    # Modify items (e.g., add timestamps)\n    item[\"scraped_at\"] = \"2026-01-01\"\n    return item\n```\n\n!!! tip\n\n    This hook can also be used to direct items through your own pipelines and drop them from the spider.\n\n### start_requests\n\nOverride `start_requests()` for custom initial request generation instead of using `start_urls`:\n\n```python\nasync def start_requests(self):\n    # POST request to log in first\n    yield Request(\n        \"https://example.com/login\",\n        method=\"POST\",\n        data={\"user\": \"admin\", \"pass\": \"secret\"},\n        callback=self.after_login,\n    )\n\nasync def after_login(self, response: Response):\n    # Now crawl the authenticated pages\n    yield response.follow(\"/dashboard\", callback=self.parse)\n```\n\n## Results & Statistics\n\nThe `CrawlResult` returned by `start()` contains both the scraped items and detailed statistics:\n\n```python\nresult = MySpider().start()\n\n# Items\nprint(f\"Total items: {len(result.items)}\")\nresult.items.to_json(\"output.json\", indent=True)\n\n# Did the crawl complete?\nprint(f\"Completed: {result.completed}\")\nprint(f\"Paused: {result.paused}\")\n\n# Statistics\nstats = result.stats\nprint(f\"Requests: {stats.requests_count}\")\nprint(f\"Failed: {stats.failed_requests_count}\")\nprint(f\"Blocked: {stats.blocked_requests_count}\")\nprint(f\"Offsite filtered: {stats.offsite_requests_count}\")\nprint(f\"Items scraped: {stats.items_scraped}\")\nprint(f\"Items dropped: {stats.items_dropped}\")\nprint(f\"Response bytes: {stats.response_bytes}\")\nprint(f\"Duration: {stats.elapsed_seconds:.1f}s\")\nprint(f\"Speed: {stats.requests_per_second:.1f} req/s\")\n```\n\n### Detailed Stats\n\nThe `CrawlStats` object tracks granular information:\n\n```python\nstats = result.stats\n\n# Status code distribution\nprint(stats.response_status_count)\n# {'status_200': 150, 'status_404': 3, 'status_403': 1}\n\n# Bytes downloaded per domain\nprint(stats.domains_response_bytes)\n# {'example.com': 1234567, 'api.example.com': 45678}\n\n# Requests per session\nprint(stats.sessions_requests_count)\n# {'http': 120, 'stealth': 34}\n\n# Proxies used during the crawl\nprint(stats.proxies)\n# ['http://proxy1:8080', 'http://proxy2:8080']\n\n# Log level counts\nprint(stats.log_levels_counter)\n# {'debug': 200, 'info': 50, 'warning': 3, 'error': 1, 'critical': 0}\n\n# Timing information\nprint(stats.start_time)       # Unix timestamp when crawl started\nprint(stats.end_time)         # Unix timestamp when crawl finished\nprint(stats.download_delay)   # The download delay used (seconds)\n\n# Concurrency settings used\nprint(stats.concurrent_requests)             # Global concurrency limit\nprint(stats.concurrent_requests_per_domain)  # Per-domain concurrency limit\n\n# Custom stats (set by your spider code)\nprint(stats.custom_stats)\n# {'login_attempts': 3, 'pages_with_errors': 5}\n\n# Export everything as a dict\nprint(stats.to_dict())\n```\n\n## 
Logging\n\nThe spider has a built-in logger accessible via `self.logger`. It's pre-configured with the spider's name and supports several customization options:\n\n| Attribute             | Default                                                      | Description                                        |\n|-----------------------|--------------------------------------------------------------|----------------------------------------------------|\n| `logging_level`       | `logging.DEBUG`                                              | Minimum log level                                  |\n| `logging_format`      | `\"[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s\"` | Log message format                                 |\n| `logging_date_format` | `\"%Y-%m-%d %H:%M:%S\"`                                        | Date format in log messages                        |\n| `log_file`            | `None`                                                       | Path to a log file (in addition to console output) |\n\n```python\nimport logging\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n    logging_level = logging.INFO\n    log_file = \"logs/my_spider.log\"\n\n    async def parse(self, response: Response):\n        self.logger.info(f\"Processing {response.url}\")\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nThe log file directory is created automatically if it doesn't exist. Both console and file output use the same format."
  },
  {
    "path": "docs/spiders/architecture.md",
    "content": "# Spiders architecture\n\n!!! success \"Prerequisites\"\n\n    1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand the different fetcher types and when to use each one.\n    2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) and [Response](../fetching/choosing.md#response-object) classes.\n\nScrapling's spider system is a Scrapy-inspired async crawling framework designed for concurrent, multi-session crawls with built-in pause/resume support. It brings together Scrapling's parsing engine and fetchers into a unified crawling API while adding scheduling, concurrency control, and checkpointing.\n\nIf you're familiar with Scrapy, you'll feel right at home. If not, don't worry — the system is designed to be straightforward.\n\n## Data Flow\n\nThe diagram below shows how data flows through the spider system when a crawl is running:\n\n<img src=\"../assets/spider_architecture.png\" title=\"Spider architecture diagram by @TrueSkills\" alt=\"Spider architecture diagram by @TrueSkills\" style=\"width: 70%;\"/>\n\nHere's what happens step by step when you run a spider without many details:\n\n1. The **Spider** produces the first batch of `Request` objects. By default, it creates one request for each URL in `start_urls`, but you can override `start_requests()` for custom logic.\n2. The **Scheduler** receives requests and places them in a priority queue, and creates fingerprints for them. Higher-priority requests are dequeued first.\n3. The **Crawler Engine** asks the **Scheduler** to dequeue the next request, respecting concurrency limits (global and per-domain) and download delays. Once the **Crawler Engine** receives the request, it passes it to the **Session Manager**, which routes it to the correct session based on the request's `sid` (session ID).\n4. The **session** fetches the page and returns a [Response](../fetching/choosing.md#response-object) object to the **Crawler Engine**. The engine records statistics and checks for blocked responses. If the response is blocked, the engine retries the request up to `max_blocked_retries` times. Of course, the blocking detection and the retry logic for blocked requests can be customized.\n5. The **Crawler Engine** passes the [Response](../fetching/choosing.md#response-object) to the request's callback. The callback either yields a dictionary, which gets treated as a scraped item, or a follow-up request, which gets sent to the scheduler for queuing.\n6. The cycle repeats from step 2 until the scheduler is empty and no tasks are active, or the spider is paused.\n7. If `crawldir` is set while starting the spider, the **Crawler Engine** periodically saves a checkpoint (pending requests + seen URLs set) to disk. On graceful shutdown (Ctrl+C), a final checkpoint is saved. The next time the spider runs with the same `crawldir`, it resumes from where it left off — skipping `start_requests()` and restoring the scheduler state.\n\n\n## Components\n\n### Spider\n\nThe central class you interact with. 
You subclass `Spider`, define your `start_urls` and `parse()` method, and optionally configure sessions and override lifecycle hooks.\n\n```python\nfrom scrapling.spiders import Spider, Response, Request\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n\n    async def parse(self, response: Response):\n        for link in response.css(\"a::attr(href)\").getall():\n            yield response.follow(link, callback=self.parse_page)\n\n    async def parse_page(self, response: Response):\n        yield {\"title\": response.css(\"h1::text\").get(\"\")}\n```\n\n### Crawler Engine\n\nThe engine orchestrates the entire crawl. It manages the main loop, enforces concurrency limits, dispatches requests through the Session Manager, and processes results from callbacks. You don't interact with it directly — the `Spider.start()` and `Spider.stream()` methods handle it for you.\n\n### Scheduler\n\nA priority queue with built-in URL deduplication. Requests are fingerprinted based on their URL, HTTP method, body, and session ID. The scheduler supports `snapshot()` and `restore()` for the checkpoint system, allowing the crawl state to be saved and resumed.\n\n### Session Manager\n\nManages one or more named session instances. Each session is one of:\n\n- [FetcherSession](../fetching/static.md)\n- [AsyncDynamicSession](../fetching/dynamic.md)\n- [AsyncStealthySession](../fetching/stealthy.md)\n\nWhen a request comes in, the Session Manager routes it to the correct session based on the request's `sid` field. Sessions can be started with the spider start (default) or lazily (started on the first use).\n\n### Checkpoint System\n\nAn optional system that, if enabled, saves the crawler's state (pending requests + seen URL fingerprints) to a pickle file on disk. Writes are atomic (temp file + rename) to prevent corruption. Checkpoints are saved periodically at a configurable interval and on graceful shutdown. Upon successful completion (not paused), checkpoint files are automatically cleaned up.\n\n### Output\n\nScraped items are collected in an `ItemList` (a list subclass with `to_json()` and `to_jsonl()` export methods). 
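\n\nFor example, here is a quick sketch of exporting the collected items after a run, using the export methods documented on the [Getting started](getting-started.md) page (the file names below are just placeholders):\n\n```python\nresult = MySpider().start()\n\n# Pretty-printed JSON array of all scraped items\nresult.items.to_json(\"items.json\", indent=True)\n\n# JSON Lines output (one item object per line)\nresult.items.to_jsonl(\"items.jsonl\")\n```\n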
Crawl statistics are tracked in a `CrawlStats` dataclass which contains a lot of useful info.\n\n\n## Comparison with Scrapy\n\nIf you're coming from Scrapy, here's how Scrapling's spider system maps:\n\n| Concept            | Scrapy                        | Scrapling                                                       |\n|--------------------|-------------------------------|-----------------------------------------------------------------|\n| Spider definition  | `scrapy.Spider` subclass      | `scrapling.spiders.Spider` subclass                             |\n| Initial requests   | `start_requests()`            | `async start_requests()`                                        |\n| Callbacks          | `def parse(self, response)`   | `async def parse(self, response)`                               |\n| Following links    | `response.follow(url)`        | `response.follow(url)`                                          |\n| Item output        | `yield dict` or `yield Item`  | `yield dict`                                                    |\n| Request scheduling | Scheduler + Dupefilter        | Scheduler with built-in deduplication                           |\n| Downloading        | Downloader + Middlewares      | Session Manager with multi-session support                      |\n| Item processing    | Item Pipelines                | `on_scraped_item()` hook                                        |\n| Blocked detection  | Through custom middlewares    | Built-in `is_blocked()` + `retry_blocked_request()` hooks       |\n| Concurrency        | `CONCURRENT_REQUESTS` setting | `concurrent_requests` class attribute                           |\n| Domain filtering   | `allowed_domains`             | `allowed_domains`                                               |\n| Pause/Resume       | `JOBDIR` setting              | `crawldir` constructor argument                                 |\n| Export             | Feed exports                  | `result.items.to_json()` / `to_jsonl()` or custom through hooks |\n| Running            | `scrapy crawl spider_name`    | `MySpider().start()`                                            |\n| Streaming          | N/A                           | `async for item in spider.stream()`                             |\n| Multi-session      | N/A                           | Multiple sessions with different types per spider               |"
  },
  {
    "path": "docs/spiders/getting-started.md",
    "content": "# Getting started\n\n## Introduction\n\n!!! success \"Prerequisites\"\n\n    1. You've completed or read the [Fetchers basics](../fetching/choosing.md) page to understand the different fetcher types and when to use each one.\n    2. You've completed or read the [Main classes](../parsing/main_classes.md) page to understand the [Selector](../parsing/main_classes.md#selector) and [Response](../fetching/choosing.md#response-object) classes.\n    3. You've read the [Architecture](architecture.md) page for a high-level overview of how the spider system works.\n\nThe spider system lets you build concurrent, multi-page crawlers in just a few lines of code. If you've used Scrapy before, the patterns will feel familiar. If not, this guide will walk you through everything you need to get started.\n\n## Your First Spider\n\nA spider is a class that defines how to crawl and extract data from websites. Here's the simplest possible spider:\n\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com\"]\n\n    async def parse(self, response: Response):\n        for quote in response.css(\"div.quote\"):\n            yield {\n                \"text\": quote.css(\"span.text::text\").get(\"\"),\n                \"author\": quote.css(\"small.author::text\").get(\"\"),\n            }\n```\n\nEvery spider needs three things:\n\n1. **`name`** — A unique identifier for the spider.\n2. **`start_urls`** — A list of URLs to start crawling from.\n3. **`parse()`** — An async generator method that processes each response and yields results.\n\nThe `parse()` method is where the magic happens. You use the same selection methods you'd use with Scrapling's [Selector](../parsing/main_classes.md#selector)/[Response](../fetching/choosing.md#response-object), and `yield` dictionaries to output scraped items.\n\n## Running the Spider\n\nTo run your spider, create an instance and call `start()`:\n\n```python\nresult = QuotesSpider().start()\n```\n\nThe `start()` method handles all the async machinery internally — no need to worry about event loops. While the spider is running, everything that happens is logged to the terminal, and at the end of the crawl, you get very detailed stats.\n\nThose stats are in the returned `CrawlResult` object, which gives you everything you need:\n\n```python\nresult = QuotesSpider().start()\n\n# Access scraped items\nfor item in result.items:\n    print(item[\"text\"], \"-\", item[\"author\"])\n\n# Check statistics\nprint(f\"Scraped {result.stats.items_scraped} items\")\nprint(f\"Made {result.stats.requests_count} requests\")\nprint(f\"Took {result.stats.elapsed_seconds:.1f} seconds\")\n\n# Did the crawl finish or was it paused?\nprint(f\"Completed: {result.completed}\")\n```\n\n## Following Links\n\nMost crawls need to follow links across multiple pages. 
Use `response.follow()` to create follow-up requests:\n\n```python\nfrom scrapling.spiders import Spider, Response\n\nclass QuotesSpider(Spider):\n    name = \"quotes\"\n    start_urls = [\"https://quotes.toscrape.com\"]\n\n    async def parse(self, response: Response):\n        # Extract items from the current page\n        for quote in response.css(\"div.quote\"):\n            yield {\n                \"text\": quote.css(\"span.text::text\").get(\"\"),\n                \"author\": quote.css(\"small.author::text\").get(\"\"),\n            }\n\n        # Follow the \"next page\" link\n        next_page = response.css(\"li.next a::attr(href)\").get()\n        if next_page:\n            yield response.follow(next_page, callback=self.parse)\n```\n\n`response.follow()` handles relative URLs automatically — it joins them with the current page's URL. It also sets the current page as the `Referer` header by default.\n\nYou can point follow-up requests at different callback methods for different page types:\n\n```python\nasync def parse(self, response: Response):\n    for link in response.css(\"a.product-link::attr(href)\").getall():\n        yield response.follow(link, callback=self.parse_product)\n\nasync def parse_product(self, response: Response):\n    yield {\n        \"name\": response.css(\"h1::text\").get(\"\"),\n        \"price\": response.css(\".price::text\").get(\"\"),\n    }\n```\n\n!!! note\n\n    All callback methods must be async generators (using `async def` and `yield`).\n\n## Exporting Data\n\nThe `ItemList` returned in `result.items` has built-in export methods:\n\n```python\nresult = QuotesSpider().start()\n\n# Export as JSON\nresult.items.to_json(\"quotes.json\")\n\n# Export as JSON with pretty-printing\nresult.items.to_json(\"quotes.json\", indent=True)\n\n# Export as JSON Lines (one JSON object per line)\nresult.items.to_jsonl(\"quotes.jsonl\")\n```\n\nBoth methods create parent directories automatically if they don't exist.\n\n## Filtering Domains\n\nUse `allowed_domains` to restrict the spider to specific domains. This prevents it from accidentally following links to external websites:\n\n```python\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n    allowed_domains = {\"example.com\"}\n\n    async def parse(self, response: Response):\n        for link in response.css(\"a::attr(href)\").getall():\n            # Links to other domains are silently dropped\n            yield response.follow(link, callback=self.parse)\n```\n\nSubdomains are matched automatically — setting `allowed_domains = {\"example.com\"}` also allows `sub.example.com`, `blog.example.com`, etc.\n\nWhen a request is filtered out, it's counted in `stats.offsite_requests_count` so you can see how many were dropped.\n\n## What's Next\n\nNow that you have the basics, you can explore:\n\n- [Requests & Responses](requests-responses.md) — learn about request priority, deduplication, metadata, and more.\n- [Sessions](sessions.md) — use multiple fetcher types (HTTP, browser, stealth) in a single spider.\n- [Proxy management & blocking](proxy-blocking.md) — rotate proxies across requests and how to handle blocking in the spider.\n- [Advanced features](advanced.md) — concurrency control, pause/resume, streaming, lifecycle hooks, and logging."
  },
  {
    "path": "docs/spiders/proxy-blocking.md",
    "content": "# Proxy management and handling Blocks\n\n## Introduction\n\n!!! success \"Prerequisites\"\n\n    1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.\n    2. You've read the [Sessions](sessions.md) page and understand how to configure sessions.\n\nWhen scraping at scale, you'll often need to rotate through multiple proxies to avoid rate limits and blocks. Scrapling's `ProxyRotator` makes this straightforward — it works with all session types and integrates with the spider's blocked request retry system.\n\nIf you don't know what a proxy is or how to choose a good one, [this guide can help](https://substack.thewebscraping.club/p/everything-about-proxies).\n\n## ProxyRotator\n\nThe `ProxyRotator` class manages a list of proxies and rotates through them automatically. Pass it to any session type via the `proxy_rotator` parameter:\n\n```python\nfrom scrapling.spiders import Spider, Response\nfrom scrapling.fetchers import FetcherSession, ProxyRotator\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n\n    def configure_sessions(self, manager):\n        rotator = ProxyRotator([\n            \"http://proxy1:8080\",\n            \"http://proxy2:8080\",\n            \"http://user:pass@proxy3:8080\",\n        ])\n        manager.add(\"default\", FetcherSession(proxy_rotator=rotator))\n\n    async def parse(self, response: Response):\n        # Check which proxy was used\n        print(f\"Proxy used: {response.meta.get('proxy')}\")\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nEach request automatically gets the next proxy in the rotation. The proxy used is stored in `response.meta[\"proxy\"]` so you can track which proxy fetched which page.\n\n\nWhen you use it with browser sessions, you will need some adjustments, like below:\n\n```python\nfrom scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, ProxyRotator\n\n# String proxies work for all session types\nrotator = ProxyRotator([\n    \"http://proxy1:8080\",\n    \"http://proxy2:8080\",\n])\n\n# Dict proxies (Playwright format) work for browser sessions\nrotator = ProxyRotator([\n    {\"server\": \"http://proxy1:8080\", \"username\": \"user\", \"password\": \"pass\"},\n    {\"server\": \"http://proxy2:8080\"},\n])\n\n# Then inside the spider\ndef configure_sessions(self, manager):\n    rotator = ProxyRotator([\"http://proxy1:8080\", \"http://proxy2:8080\"])\n    manager.add(\"browser\", AsyncStealthySession(proxy_rotator=rotator))\n```\n\n!!! info\n\n    1. You cannot use the `proxy_rotator` argument together with the static `proxy` or `proxies` parameters on the same session. Pick one approach when configuring the session, and override it per request later if you want, as we will show later.\n    2. Remember that by default, all browser-based sessions use a persistent browser context with a pool of tabs. However, since browsers can't set a proxy per tab, when you use a `ProxyRotator`, the fetcher will automatically open a separate context for each proxy, with one tab per context. 
Once the tab's job is done, both the tab and its context are closed.\n\n## Custom Rotation Strategies\n\nBy default, `ProxyRotator` uses cyclic rotation — it iterates through proxies sequentially, wrapping around at the end.\n\nYou can provide a custom strategy function to change this behavior, but it has to match the below signature:\n\n```python\nfrom scrapling.core._types import ProxyType\n\ndef my_strategy(proxies: list, current_index: int) -> tuple[ProxyType, int]:\n    ...\n```\n\nIt receives the list of proxies and the current index, and must return the chosen proxy and the next index.\n\nBelow are some examples of custom rotation strategies you can use.\n\n### Random Rotation\n\n```python\nimport random\nfrom scrapling.fetchers import ProxyRotator\n\ndef random_strategy(proxies, current_index):\n    idx = random.randint(0, len(proxies) - 1)\n    return proxies[idx], idx\n\nrotator = ProxyRotator(\n    [\"http://proxy1:8080\", \"http://proxy2:8080\", \"http://proxy3:8080\"],\n    strategy=random_strategy,\n)\n```\n\n### Weighted Rotation\n\n```python\nimport random\n\ndef weighted_strategy(proxies, current_index):\n    # First proxy gets 60% of traffic, others split the rest\n    weights = [60] + [40 // (len(proxies) - 1)] * (len(proxies) - 1)\n    proxy = random.choices(proxies, weights=weights, k=1)[0]\n    return proxy, current_index  # Index doesn't matter for weighted\n\nrotator = ProxyRotator(proxies, strategy=weighted_strategy)\n```\n\n\n## Per-Request Proxy Override\n\nYou can override the rotator for individual requests by passing `proxy=` as a keyword argument:\n\n```python\nasync def parse(self, response: Response):\n    # This request uses the rotator's next proxy\n    yield response.follow(\"/page1\", callback=self.parse_page)\n\n    # This request uses a specific proxy, bypassing the rotator\n    yield response.follow(\n        \"/special-page\",\n        callback=self.parse_page,\n        proxy=\"http://special-proxy:8080\",\n    )\n```\n\nThis is useful when certain pages require a specific proxy (e.g., a geo-located proxy for region-specific content).\n\n## Blocked Request Handling\n\nThe spider has built-in blocked request detection and retry. By default, it considers the following HTTP status codes blocked: `401`, `403`, `407`, `429`, `444`, `500`, `502`, `503`, `504`.\n\nThe retry system works like this:\n\n1. After a response comes back, the spider calls the `is_blocked(response)` method.\n2. If blocked, it copies the request and calls the `retry_blocked_request()` method so you can modify it before retrying.\n3. The retried request is re-queued with `dont_filter=True` (bypassing deduplication) and lower priority, so it's not retried right away.\n4. This repeats up to `max_blocked_retries` times (default: 3).\n\n!!! tip\n\n    1. On retry, the previous `proxy`/`proxies` kwargs are cleared from the request automatically, so the rotator assigns a fresh proxy.\n    2. 
The `max_blocked_retries` attribute is different than the session retries and doesn't share the counter.\n\n### Custom Block Detection\n\nOverride `is_blocked()` to add your own detection logic:\n\n```python\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n\n    async def is_blocked(self, response: Response) -> bool:\n        # Check status codes (default behavior)\n        if response.status in {403, 429, 503}:\n            return True\n\n        # Check response content\n        body = response.body.decode(\"utf-8\", errors=\"ignore\")\n        if \"access denied\" in body.lower() or \"rate limit\" in body.lower():\n            return True\n\n        return False\n\n    async def parse(self, response: Response):\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\n### Customizing Retries\n\nOverride `retry_blocked_request()` to modify the request before retrying. The `max_blocked_retries` attribute controls how many times a blocked request is retried (default: 3):\n\n```python\nfrom scrapling.spiders import Spider, SessionManager, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n    max_blocked_retries = 5\n\n    def configure_sessions(self, manager: SessionManager) -> None:\n        manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari']))\n        manager.add('stealth', AsyncStealthySession(block_webrtc=True), lazy=True)\n\n    async def retry_blocked_request(self, request: Request, response: Response) -> Request:\n        request.sid = \"stealth\"\n        self.logger.info(f\"Retrying blocked request: {request.url}\")\n        return request\n\n    async def parse(self, response: Response):\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nWhat happened above is that I left the blocking detection logic unchanged and had the spider mainly use requests until it got blocked, then switch to the stealthy browser.\n\n\nPutting it all together:\n\n```python\nfrom scrapling.spiders import Spider, SessionManager, Request, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession, ProxyRotator\n\n\ncheap_proxies = ProxyRotator([ \"http://proxy1:8080\", \"http://proxy2:8080\"])\n\n# A format acceptable by the browser\nexpensive_proxies = ProxyRotator([\n    {\"server\": \"http://residential_proxy1:8080\", \"username\": \"user\", \"password\": \"pass\"},\n    {\"server\": \"http://residential_proxy2:8080\", \"username\": \"user\", \"password\": \"pass\"},\n    {\"server\": \"http://mobile_proxy1:8080\", \"username\": \"user\", \"password\": \"pass\"},\n    {\"server\": \"http://mobile_proxy2:8080\", \"username\": \"user\", \"password\": \"pass\"},\n])\n\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n    max_blocked_retries = 5\n\n    def configure_sessions(self, manager: SessionManager) -> None:\n        manager.add('requests', FetcherSession(impersonate=['chrome', 'firefox', 'safari'], proxy_rotator=cheap_proxies))\n        manager.add('stealth', AsyncStealthySession(block_webrtc=True, proxy_rotator=expensive_proxies), lazy=True)\n\n    async def retry_blocked_request(self, request: Request, response: Response) -> Request:\n        request.sid = \"stealth\"\n        self.logger.info(f\"Retrying blocked request: {request.url}\")\n        return request\n\n    async def 
\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nThe logic above: requests go through the cheap (e.g., datacenter) proxies until they get blocked, and blocked requests are then retried through the higher-quality residential or mobile proxies."
  },
  {
    "path": "docs/spiders/requests-responses.md",
    "content": "# Requests & Responses\n\n!!! success \"Prerequisites\"\n\n    1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.\n\nThis page covers the `Request` object in detail — how to construct requests, pass data between callbacks, control priority and deduplication, and use `response.follow()` for link-following.\n\n## The Request Object\n\nA `Request` represents a URL to be fetched. You create requests either directly or via `response.follow()`:\n\n```python\nfrom scrapling.spiders import Request\n\n# Direct construction\nrequest = Request(\n    \"https://example.com/page\",\n    callback=self.parse_page,\n    priority=5,\n)\n\n# Via response.follow (preferred in callbacks)\nrequest = response.follow(\"/page\", callback=self.parse_page)\n```\n\nHere are all the arguments you can pass to `Request`:\n\n| Argument      | Type       | Default    | Description                                                                                           |\n|---------------|------------|------------|-------------------------------------------------------------------------------------------------------|\n| `url`         | `str`      | *required* | The URL to fetch                                                                                      |\n| `sid`         | `str`      | `\"\"`       | Session ID — routes the request to a specific session (see [Sessions](sessions.md))                   |\n| `callback`    | `callable` | `None`     | Async generator method to process the response. Defaults to `parse()`                                 |\n| `priority`    | `int`      | `0`        | Higher values are processed first                                                                     |\n| `dont_filter` | `bool`     | `False`    | If `True`, skip deduplication (allow duplicate requests)                                              |\n| `meta`        | `dict`     | `{}`       | Arbitrary metadata passed through to the response                                                     |\n| `**kwargs`    |            |            | Additional keyword arguments passed to the session's fetch method (e.g., `headers`, `method`, `data`) |\n\nAny extra keyword arguments are forwarded directly to the underlying session. For example, to make a POST request:\n\n```python\nyield Request(\n    \"https://example.com/api\",\n    method=\"POST\",\n    data={\"key\": \"value\"},\n    callback=self.parse_result,\n)\n```\n\n## Response.follow()\n\n`response.follow()` is the recommended way to create follow-up requests inside callbacks. 
It offers several advantages over constructing `Request` objects directly:\n\n- **Relative URLs** are resolved automatically against the current page URL\n- **Referer header** is set to the current page URL by default\n- **Session kwargs** from the original request are inherited (headers, proxy settings, etc.)\n- **Callback, session ID, and priority** are inherited from the original request if not specified\n\n```python\nasync def parse(self, response: Response):\n    # Minimal — inherits callback, sid, priority from current request\n    yield response.follow(\"/next-page\")\n\n    # Override specific fields\n    yield response.follow(\n        \"/product/123\",\n        callback=self.parse_product,\n        priority=10,\n    )\n\n    # Pass additional metadata to the callback\n    yield response.follow(\n        \"/details\",\n        callback=self.parse_details,\n        meta={\"category\": \"electronics\"},\n    )\n```\n\n| Argument           | Type       | Default    | Description                                                |\n|--------------------|------------|------------|------------------------------------------------------------|\n| `url`              | `str`      | *required* | URL to follow (absolute or relative)                       |\n| `sid`              | `str`      | `\"\"`       | Session ID (inherits from original request if empty)       |\n| `callback`         | `callable` | `None`     | Callback method (inherits from original request if `None`) |\n| `priority`         | `int`      | `None`     | Priority (inherits from original request if `None`)        |\n| `dont_filter`      | `bool`     | `False`    | Skip deduplication                                         |\n| `meta`             | `dict`     | `None`     | Metadata (merged with existing response meta)              |\n| **`referer_flow`** | `bool`     | `True`     | Set current URL as Referer header                          |\n| `**kwargs`         |            |            | Merged with original request's session kwargs              |\n\n### Disabling Referer Flow\n\nBy default, `response.follow()` sets the `Referer` header to the current page URL. To disable this:\n\n```python\nyield response.follow(\"/page\", referer_flow=False)\n```\n\n## Callbacks\n\nCallbacks are async generator methods on your spider that process responses. They must `yield` one of three types:\n\n- **`dict`** — A scraped item, added to the results\n- **`Request`** — A follow-up request, added to the queue\n- **`None`** — Silently ignored\n\n```python\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n\n    async def parse(self, response: Response):\n        # Yield items (dicts)\n        yield {\"url\": response.url, \"title\": response.css(\"title::text\").get(\"\")}\n\n        # Yield follow-up requests\n        for link in response.css(\"a::attr(href)\").getall():\n            yield response.follow(link, callback=self.parse_page)\n\n    async def parse_page(self, response: Response):\n        yield {\"content\": response.css(\"article::text\").get(\"\")}\n```\n\n!!! tip \"Note:\"\n\n    All callback methods must be `async def` and use `yield` (not `return`). Even if a callback only yields items with no follow-up requests, it must still be an async generator.\n\n## Request Priority\n\n
Requests with higher priority values are processed first. This is useful when some pages should be processed before others:\n\n```python\nasync def parse(self, response: Response):\n    # High priority — process product pages first\n    for link in response.css(\"a.product::attr(href)\").getall():\n        yield response.follow(link, callback=self.parse_product, priority=10)\n\n    # Low priority — pagination links processed after products\n    next_page = response.css(\"a.next::attr(href)\").get()\n    if next_page:\n        yield response.follow(next_page, callback=self.parse, priority=0)\n```\n\nWhen using `response.follow()`, the priority is inherited from the original request unless you specify a new one.\n\n## Deduplication\n\nThe spider automatically deduplicates requests based on a fingerprint computed from the URL, HTTP method, request body, and session ID. If two requests produce the same fingerprint, the second one is silently dropped.\n\nTo allow duplicate requests (e.g., re-visiting a page after login), set `dont_filter=True`:\n\n```python\nyield Request(\"https://example.com/dashboard\", dont_filter=True, callback=self.parse_dashboard)\n\n# Or with response.follow\nyield response.follow(\"/dashboard\", dont_filter=True, callback=self.parse_dashboard)\n```\n\nYou can fine-tune what goes into the fingerprint using class attributes on your spider:\n\n| Attribute            | Default | Effect                                                                                                          |\n|----------------------|---------|-----------------------------------------------------------------------------------------------------------------|\n| `fp_include_kwargs`  | `False` | Include extra request kwargs (arguments you passed to the session fetch, like headers, etc.) in the fingerprint |\n| `fp_keep_fragments`  | `False` | Keep URL fragments (`#section`) when computing fingerprints                                                     |\n| `fp_include_headers` | `False` | Include request headers in the fingerprint                                                                      |\n\nFor example, if you need to treat `https://example.com/page#section1` and `https://example.com/page#section2` as different URLs:\n\n```python\nclass MySpider(Spider):\n    name = \"my_spider\"\n    fp_keep_fragments = True\n    # ...\n```\n\n## Request Meta\n\nThe `meta` dictionary lets you pass arbitrary data between callbacks. This is useful when you need context from one page to process another:\n\n```python\nasync def parse(self, response: Response):\n    for product in response.css(\"div.product\"):\n        category = product.css(\"span.category::text\").get(\"\")\n        link = product.css(\"a::attr(href)\").get()\n        if link:\n            yield response.follow(\n                link,\n                callback=self.parse_product,\n                meta={\"category\": category},\n            )\n\nasync def parse_product(self, response: Response):\n    yield {\n        \"name\": response.css(\"h1::text\").get(\"\"),\n        \"price\": response.css(\".price::text\").get(\"\"),\n        # Access meta from the request\n        \"category\": response.meta.get(\"category\", \"\"),\n    }\n```\n\nWhen using `response.follow()`, the meta from the current response is merged with the new meta you provide (new values take precedence).\n\nThe spider system also automatically stores some metadata. For example, the proxy used for a request is available as `response.meta[\"proxy\"]` when proxy rotation is enabled."
  },
  {
    "path": "docs/spiders/sessions.md",
    "content": "# Spiders sessions\n\n!!! success \"Prerequisites\"\n\n    1. You've read the [Getting started](getting-started.md) page and know how to create and run a basic spider.\n    2. You're familiar with [Fetchers basics](../fetching/choosing.md) and the differences between HTTP, Dynamic, and Stealthy sessions.\n\nA spider can use multiple fetcher sessions simultaneously — for example, a fast HTTP session for simple pages and a stealth browser session for protected pages. This page shows you how to configure and use sessions.\n\n## What are Sessions?\n\nAs you should already know, a session is a pre-configured fetcher instance that stays alive for the duration of the crawl. Instead of creating a new connection or browser for every request, the spider reuses sessions, which is faster and more resource-efficient.\n\nBy default, every spider creates a single [FetcherSession](../fetching/static.md). You can add more sessions or swap the default by overriding the `configure_sessions()` method, but you have to use the async version of each session only, as the table shows below:\n\n\n| Session Type                                    | Use Case                                 |\n|-------------------------------------------------|------------------------------------------|\n| [FetcherSession](../fetching/static.md)         | Fast HTTP requests, no JavaScript        |\n| [AsyncDynamicSession](../fetching/dynamic.md)   | Browser automation, JavaScript rendering |\n| [AsyncStealthySession](../fetching/stealthy.md) | Anti-bot bypass, Cloudflare, etc.        |\n\n\n## Configuring Sessions\n\nOverride `configure_sessions()` on your spider to set up sessions. The `manager` parameter is a `SessionManager` instance — use `manager.add()` to register sessions:\n\n```python\nfrom scrapling.spiders import Spider, Response\nfrom scrapling.fetchers import FetcherSession\n\nclass MySpider(Spider):\n    name = \"my_spider\"\n    start_urls = [\"https://example.com\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"default\", FetcherSession())\n\n    async def parse(self, response: Response):\n        yield {\"title\": response.css(\"title::text\").get(\"\")}\n```\n\nThe `manager.add()` method takes:\n\n| Argument     | Type      | Default    | Description                                  |\n|--------------|-----------|------------|----------------------------------------------|\n| `session_id` | `str`     | *required* | A name to reference this session in requests |\n| `session`    | `Session` | *required* | The session instance                         |\n| `default`    | `bool`    | `False`    | Make this the default session                |\n| `lazy`       | `bool`    | `False`    | Start the session only when first used       |\n\n!!! note \"Notes:\"\n\n    1. In all requests, if you don't specify which session to use, the default session is used. The default session is determined in one of two ways:\n        1. The first session you add to the managed becomes the default automatically.\n        2. The session that gets `default=True` while added to the manager.\n    2. The instances you pass of each session don't have to be already started by you; the spider checks on all sessions if they are not already started and starts them.\n    3. If you want a specific session to start when used only, then use the `lazy` argument while adding that session to the manager. 
## Multi-Session Spider\n\nHere's a practical example: use a fast HTTP session for listing pages and a stealth browser for detail pages that have bot protection:\n\n```python\nfrom scrapling.spiders import Spider, Response\nfrom scrapling.fetchers import FetcherSession, AsyncStealthySession\n\nclass ProductSpider(Spider):\n    name = \"products\"\n    start_urls = [\"https://shop.example.com/products\"]\n\n    def configure_sessions(self, manager):\n        # Fast HTTP for listing pages (default)\n        manager.add(\"http\", FetcherSession())\n\n        # Stealth browser for protected product pages\n        manager.add(\"stealth\", AsyncStealthySession(\n            headless=True,\n            network_idle=True,\n        ))\n\n    async def parse(self, response: Response):\n        for link in response.css(\"a.product::attr(href)\").getall():\n            # Route product pages through the stealth session\n            yield response.follow(link, sid=\"stealth\", callback=self.parse_product)\n\n        next_page = response.css(\"a.next::attr(href)\").get()\n        if next_page:\n            yield response.follow(next_page)\n\n    async def parse_product(self, response: Response):\n        yield {\n            \"name\": response.css(\"h1::text\").get(\"\"),\n            \"price\": response.css(\".price::text\").get(\"\"),\n        }\n```\n\nThe key is the `sid` parameter — it tells the spider which session to use for each request. When you call `response.follow()` without `sid`, the session ID from the original request is inherited.\n\nNote that sessions don't have to be of different classes; you can also register multiple instances of the same session class with different configurations, like below:\n\n```python\nfrom scrapling.spiders import Spider, Response\nfrom scrapling.fetchers import FetcherSession\n\nclass ProductSpider(Spider):\n    name = \"products\"\n    start_urls = [\"https://shop.example.com/products\"]\n\n    def configure_sessions(self, manager):\n        chrome_requests = FetcherSession(impersonate=\"chrome\")\n        firefox_requests = FetcherSession(impersonate=\"firefox\")\n\n        manager.add(\"chrome\", chrome_requests)\n        manager.add(\"firefox\", firefox_requests)\n\n    async def parse(self, response: Response):\n        for link in response.css(\"a.product::attr(href)\").getall():\n            yield response.follow(link, callback=self.parse_product)\n\n        next_page = response.css(\"a.next::attr(href)\").get()\n        if next_page:\n            yield response.follow(next_page, sid=\"firefox\")\n\n    async def parse_product(self, response: Response):\n        yield {\n            \"name\": response.css(\"h1::text\").get(\"\"),\n            \"price\": response.css(\".price::text\").get(\"\"),\n        }\n```\n\nYou can also use separate instances to isolate concerns, e.g., keeping one session's cookies and state dedicated to a specific group of requests.\n\n## Session Arguments\n\nExtra keyword arguments passed to a `Request` (or through `response.follow(**kwargs)`) are forwarded to the session's fetch method.\n
This lets you customize individual requests without changing the session configuration:\n\n```python\nasync def parse(self, response: Response):\n    # Pass extra headers for this specific request\n    yield Request(\n        \"https://api.example.com/data\",\n        headers={\"Authorization\": \"Bearer token123\"},\n        callback=self.parse_api,\n    )\n\n    # Use a different HTTP method\n    yield Request(\n        \"https://example.com/submit\",\n        method=\"POST\",\n        data={\"field\": \"value\"},\n        sid=\"firefox\",\n        callback=self.parse_result,\n    )\n```\n\n!!! warning\n\n    Normally, when you use `FetcherSession`, `Fetcher`, or `AsyncFetcher`, you specify the HTTP method to use with the corresponding method like `.get()` and `.post()`. But while using `FetcherSession` in spiders, you can't do this. By default, the request is an _HTTP GET_ request; if you want to use another HTTP method, you have to pass it to the `method` argument, as in the above example. The reason for this is to unify the `Request` interface across all session types.\n\nFor browser sessions (`AsyncDynamicSession`, `AsyncStealthySession`), you can pass browser-specific arguments like `wait_selector`, `page_action`, or `extra_headers`:\n\n```python\nasync def parse(self, response: Response):\n    # Use Cloudflare solver with the `AsyncStealthySession` we configured above\n    yield Request(\n        \"https://nopecha.com/demo/cloudflare\",\n        sid=\"stealth\",\n        callback=self.parse_result,\n        solve_cloudflare=True,\n        block_webrtc=True,\n        hide_canvas=True,\n        google_search=True,\n    )\n\n    yield response.follow(\n        \"/dynamic-page\",\n        sid=\"browser\",\n        callback=self.parse_dynamic,\n        wait_selector=\"div.loaded\",\n        network_idle=True,\n    )\n```\n\n!!! warning\n\n    Session arguments (**kwargs) passed from the original request are inherited by `response.follow()`. New kwargs take precedence over inherited ones.\n\n```python\nfrom scrapling.spiders import Spider, Response\nfrom scrapling.fetchers import FetcherSession\n\nclass ProductSpider(Spider):\n    name = \"products\"\n    start_urls = [\"https://shop.example.com/products\"]\n\n    def configure_sessions(self, manager):\n        manager.add(\"http\", FetcherSession(impersonate='chrome'))\n\n    async def parse(self, response: Response):\n        # I don't want the follow request to impersonate a desktop Chrome like the previous request, but a mobile one\n        # so I override it like this\n        for link in response.css(\"a.product::attr(href)\").getall():\n            yield response.follow(link, impersonate=\"chrome131_android\", callback=self.parse_product)\n\n        next_page = response.css(\"a.next::attr(href)\").get()\n        if next_page:\n            yield Request(next_page)\n\n    async def parse_product(self, response: Response):\n        yield {\n            \"name\": response.css(\"h1::text\").get(\"\"),\n            \"price\": response.css(\".price::text\").get(\"\"),\n        }\n```\n!!! info\n\n    No need to mention that, upon spider closure, the manager automatically checks whether any sessions are still running and closes them before closing the spider."
  },
  {
    "path": "docs/stylesheets/extra.css",
    "content": ".md-grid {\n  max-width: 90%;\n}\n\n@font-face {\n  font-family: 'Maple Mono';\n  font-style: normal;\n  font-display: swap;\n  font-weight: 400;\n  src: url(https://cdn.jsdelivr.net/fontsource/fonts/maple-mono@latest/latin-400-normal.woff2) format('woff2'), url(https://cdn.jsdelivr.net/fontsource/fonts/maple-mono@latest/latin-400-normal.woff) format('woff');\n}\n\n:root {\n  --md-code-font: 'Maple Mono';\n}\n[align=\"center\"] code {\n  font-family: 'Maple Mono';\n  font-style: italic;\n  font-weight: 800;\n}\n\n/* Announcement banner background */\n[data-md-color-scheme=\"default\"] .md-banner {\n  background-color: #232946;\n}\n\n[data-md-color-scheme=\"slate\"] .md-banner {\n  background-color: #141428;\n}"
  },
  {
    "path": "docs/tutorials/migrating_from_beautifulsoup.md",
    "content": "# Migrating from BeautifulSoup to Scrapling\n\nIf you're already familiar with BeautifulSoup, you're in for a treat. Scrapling is much faster, provides the same parsing capabilities as BS, adds additional parsing capabilities not found in BS, and introduces powerful new features for fetching and handling modern web pages. This guide will help you quickly adapt your existing BeautifulSoup code to leverage Scrapling's capabilities.\n\nBelow is a table that covers the most common operations you'll perform when scraping web pages. Each row illustrates how to achieve a specific task using BeautifulSoup and the corresponding method in Scrapling.\n\nYou will notice that some shortcuts in BeautifulSoup are missing in Scrapling, which is one of the reasons BeautifulSoup is slower than Scrapling. The point is: If the same feature can be used in a short one-liner, there is no need to sacrifice performance to shorten that short line :)\n\n\n| Task                                                            | BeautifulSoup Code                                                                                            | Scrapling Code                                                                    |\n|-----------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------|\n| Parser import                                                   | `from bs4 import BeautifulSoup`                                                                               | `from scrapling.parser import Selector`                                           |\n| Parsing HTML from string                                        | `soup = BeautifulSoup(html, 'html.parser')`                                                                   | `page = Selector(html)`                                                           |\n| Finding a single element                                        | `element = soup.find('div', class_='example')`                                                                | `element = page.find('div', class_='example')`                                    |\n| Finding multiple elements                                       | `elements = soup.find_all('div', class_='example')`                                                           | `elements = page.find_all('div', class_='example')`                               |\n| Finding a single element (Example 2)                            | `element = soup.find('div', attrs={\"class\": \"example\"})`                                                      | `element = page.find('div', {\"class\": \"example\"})`                                |\n| Finding a single element (Example 3)                            | `element = soup.find(re.compile(\"^b\"))`                                                                       | `element = page.find(re.compile(\"^b\"))`<br/>`element = page.find_by_regex(r\"^b\")` |\n| Finding a single element (Example 4)                            | `element = soup.find(lambda e: len(list(e.children)) > 0)`                                                    | `element = page.find(lambda e: len(e.children) > 0)`                              |\n| Finding a single element (Example 5)                            | `element = soup.find([\"a\", \"b\"])`                                                                             | `element = 
page.find([\"a\", \"b\"])`                                                 |\n| Find element by its text content                                | `element = soup.find(text=\"some text\")`                                                                       | `element = page.find_by_text(\"some text\", partial=False)`                         |\n| Using CSS selectors to find the first matching element          | `elements = soup.select_one('div.example')`                                                                   | `elements = page.css('div.example').first`                                        |\n| Using CSS selectors to find all matching element                | `elements = soup.select('div.example')`                                                                       | `elements = page.css('div.example')`                                              |\n| Get a prettified version of the page/element source             | `prettified = soup.prettify()`                                                                                | `prettified = page.prettify()`                                                    |\n| Get a Non-pretty version of the page/element source             | `source = str(soup)`                                                                                          | `source = page.html_content`                                                      |\n| Get tag name of an element                                      | `name = element.name`                                                                                         | `name = element.tag`                                                              |\n| Extracting text content of an element                           | `string = element.string`                                                                                     | `string = element.text`                                                           |\n| Extracting all the text in a document or beneath a tag          | `text = soup.get_text(strip=True)`                                                                            | `text = page.get_all_text(strip=True)`                                            |\n| Access the dictionary of attributes                             | `attrs = element.attrs`                                                                                       | `attrs = element.attrib`                                                          |\n| Extracting attributes                                           | `attr = element['href']`                                                                                      | `attr = element['href']`                                                          |\n| Navigating to parent                                            | `parent = element.parent`                                                                                     | `parent = element.parent`                                                         |\n| Get all parents of an element                                   | `parents = list(element.parents)`                                                                             | `parents = list(element.iterancestors())`                                         |\n| Searching for an element in the parents of an element           | `target_parent = element.find_parent(\"a\")`                                                                    | `target_parent = element.find_ancestor(lambda p: p.tag == 'a')`                   |\n| Get all siblings of an 
element                                  | N/A                                                                                                           | `siblings = element.siblings`                                                     |\n| Get next sibling of an element                                  | `next_element = element.next_sibling`                                                                         | `next_element = element.next`                                                     |\n| Searching for an element in the siblings of an element          | `target_sibling = element.find_next_sibling(\"a\")`<br/>`target_sibling = element.find_previous_sibling(\"a\")`   | `target_sibling = element.siblings.search(lambda s: s.tag == 'a')`                |\n| Searching for elements in the siblings of an element            | `target_sibling = element.find_next_siblings(\"a\")`<br/>`target_sibling = element.find_previous_siblings(\"a\")` | `target_sibling = element.siblings.filter(lambda s: s.tag == 'a')`                |\n| Searching for an element in the next elements of an element     | `target_parent = element.find_next(\"a\")`                                                                      | `target_parent = element.below_elements.search(lambda p: p.tag == 'a')`           |\n| Searching for elements in the next elements of an element       | `target_parent = element.find_all_next(\"a\")`                                                                  | `target_parent = element.below_elements.filter(lambda p: p.tag == 'a')`           |\n| Searching for an element in the ancestors of an element         | `target_parent = element.find_previous(\"a\")` ¹                                                                | `target_parent = element.path.search(lambda p: p.tag == 'a')`                     |\n| Searching for elements in the ancestors of an element           | `target_parent = element.find_all_previous(\"a\")` ¹                                                            | `target_parent = element.path.filter(lambda p: p.tag == 'a')`                     |\n| Get previous sibling of an element                              | `prev_element = element.previous_sibling`                                                                     | `prev_element = element.previous`                                                 |\n| Navigating to children                                          | `children = list(element.children)`                                                                           | `children = element.children`                                                     |\n| Get all descendants of an element                               | `children = list(element.descendants)`                                                                        | `children = element.below_elements`                                               |\n| Filtering a group of elements that satisfies a condition        | `group = soup.find('p', 'story').css.filter('a')`                                                             | `group = page.find_all('p', 'story').filter(lambda p: p.tag == 'a')`              |\n\n\n¹ **Note:** BS4's `find_previous`/`find_all_previous` searches all preceding elements in document order, while Scrapling's `path` only returns ancestors (the parent chain). 
These are not exact equivalents, but ancestor search covers the most common use case.\n\n**One key point to remember**: BeautifulSoup offers features for modifying and manipulating the page after it has been parsed. Scrapling focuses more on scraping the page faster for you, and then you can do what you want with the extracted information. So, two different tools can be used in Web Scraping, but one of them specializes in Web Scraping :)\n\n### Putting It All Together\n\nHere's a simple example of scraping a web page to extract all the links using BeautifulSoup and Scrapling.\n\n**With BeautifulSoup:**\n\n```python\nimport requests\nfrom bs4 import BeautifulSoup\n\nurl = 'https://example.com'\nresponse = requests.get(url)\nsoup = BeautifulSoup(response.text, 'html.parser')\n\nlinks = soup.find_all('a')\nfor link in links:\n    print(link['href'])\n```\n\n**With Scrapling:**\n\n```python\nfrom scrapling import Fetcher\n\nurl = 'https://example.com'\npage = Fetcher.get(url)\n\nlinks = page.css('a::attr(href)')\nfor link in links:\n    print(link)\n```\n\nAs you can see, Scrapling simplifies the process by combining fetching and parsing into a single step, making your code cleaner and more efficient.\n\n!!! abstract \"**Additional Notes:**\"\n\n    - **Different parsers**: BeautifulSoup allows you to set the parser engine to use, and one of them is `lxml`. Scrapling doesn't do that and uses the `lxml` library by default for performance reasons.\n    - **Element Types**: In BeautifulSoup, elements are `Tag` objects; in Scrapling, they are `Selector` objects. However, they provide similar methods and properties for navigation and data extraction.\n    - **Error Handling**: Both libraries return `None` when an element is not found (e.g., `soup.find()` or `page.find()`). In Scrapling, `page.css()` returns an empty `Selectors` list when no elements match, and you can use `page.css('.foo').first` to safely get the first match or `None`. To avoid errors, check for `None` or empty results before accessing properties.\n    - **Text Extraction**: Scrapling provides additional methods for handling text through `TextHandler`, such as `clean()`, which can help remove extra whitespace, consecutive spaces, or unwanted characters. Please check out the documentation for the complete list.\n\nThe documentation provides more details on Scrapling's features and the complete list of arguments that can be passed to all methods.\n\nThis guide should make your transition from BeautifulSoup to Scrapling smooth and straightforward. Happy scraping!"
  },
  {
    "path": "docs/tutorials/replacing_ai.md",
    "content": "# Scrapling: A Free Alternative to AI for Robust Web Scraping\n\nWeb scraping has long been a vital tool for data extraction, indexing, and preparing datasets, among other purposes. But experienced users often encounter persistent issues that can hinder effectiveness. Recently, there's been a noticeable shift toward AI-based web scraping, driven by its potential to address these challenges.\n\nIn this article, we will discuss these common issues, why companies are shifting toward that approach, the problems with that approach, and how scrapling solves them for you without the cost of using AI.\n\n## Common issues and challenging goals\n\nIf you have been doing Web Scraping for a long time, you probably noticed that there are repeating problems with Web Scraping, like:\n\n1. **Rapidly changing website structures** — Sites frequently update their DOM structures, breaking static XPath/CSS selectors.\n2. **Unstable selectors** — Class names and IDs often change or use randomly generated values that break scrapers or make scraping these websites difficult.\n3. **Increasingly complex anti-bot measures** — CAPTCHA systems, browser fingerprinting, and behavior analysis make traditional scraping difficult\nand others\n\nBut that's only if you are doing targeted Web Scraping for known websites, in which case you can write specific code for every website.\n\nIf you start thinking about bigger goals like Broad Scraping or Generic Web Scraping, or what you like to call it, then the above issues intensify, and you will face new issues like:\n\n1. **Extreme Website Diversity** — Generic scraping must handle countless variations in HTML structures, CSS usage, JavaScript frameworks, and backend technologies.\n2. **Identifying Relevant Data** — How does the scraper know what data is important on a page it has never seen before?\n3. **Pagination variations** — Infinite scroll, traditional pagination, \"load more\" buttons, all requiring different approaches\nand more\n\nHow will you solve that manually? I'm referring to generic web scraping of various websites that don't share any common technologies.\n\n## AI to the rescue, but at a high cost\n\nOf course, AI can easily solve most of these issues because it can understand the page source and identify the fields you want or create selectors for them. That's, of course, if you already solved the anti-bot measures through other tools :)\n\nThis approach is, of course, beautiful. I love AI and find it very fascinating, especially Generative AI. You will probably spend a lot of time on prompt engineering and tweaking the prompts, but if that's cool with you, you will soon hit the real issue with using AI here.\n\nMost websites have vast amounts of content per page, which you will need to pass to the AI somehow so it can do its magic. This will burn through tokens like fire in a haystack, quickly accumulating high costs.\n\nUnless money is irrelevant to you, you will try to find less expensive approaches, and that's where Scrapling comes into play :smile:\n\n## Scrapling got you covered\n\nScrapling can handle almost all issues you will face during Web Scraping, and the following updates will cover the rest carefully.\n\n### Solving issue T1: Rapidly changing website structures\nThat's why the [adaptive](https://scrapling.readthedocs.io/en/latest/parsing/adaptive.html) feature was made. 
That's why the [adaptive](https://scrapling.readthedocs.io/en/latest/parsing/adaptive.html) feature was made. You knew I would talk about it, and here we are :)\n\nWhile Web Scraping, if you have the `adaptive` feature enabled, you can save any element's unique properties so you can find it again later when the website's structure changes. The most frustrating thing about such changes is that anything about an element can change, so there's nothing to rely on.\n\nThat's how the adaptive feature works: it stores everything unique about an element. When the website structure changes, it returns the element with the highest similarity score to the previous element.\n\nI have already explained this in more detail, with many examples. Read more [here](https://scrapling.readthedocs.io/en/latest/parsing/adaptive.html#how-the-adaptive-feature-works).\n\n### Solving issue T2: Unstable selectors\n\nIf you have been doing Web Scraping for a long enough time, you have likely experienced this at least once. I'm referring to a website that employs poor design patterns, is built on raw HTML without any IDs/classes, or uses random class names with nothing else to rely on, etc...\n\nIn these cases, standard selection methods with CSS/XPath selectors won't be optimal, and that's why Scrapling provides three more selection methods:\n\n1. [Selection by element content](https://scrapling.readthedocs.io/en/latest/parsing/selection.html#text-content-selection): Through text content (`find_by_text`) or regex that matches text content (`find_by_regex`)\n2. [Selecting elements similar to another element](https://scrapling.readthedocs.io/en/latest/parsing/selection.html#finding-similar-elements): You find an element, and we will do the rest!\n3. [Selecting elements by filters](https://scrapling.readthedocs.io/en/latest/parsing/selection.html#filters-based-searching): You specify conditions/filters that this element must fulfill, and we find it!\n\nThere is no need to explain any of these; click on the links, and it will be clear how Scrapling solves this.\n\n### Solving issue T3: Increasingly complex anti-bot measures\n\nIt's well known that creating an undetectable spider requires more than residential/mobile proxies and human-like behavior. It also needs a hard-to-detect browser, and Scrapling provides two main options for that:\n\n1. [DynamicFetcher](https://scrapling.readthedocs.io/en/latest/fetching/dynamic.html) — This fetcher provides flexible browser automation with multiple configuration options and a few under-the-hood stealth improvements.\n2. [StealthyFetcher](https://scrapling.readthedocs.io/en/latest/fetching/stealthy.html) — Because we live in a harsh world and you need to take [full measure instead of half-measures](https://www.youtube.com/watch?v=7BE4QcwX4dU), `StealthyFetcher` was born. This fetcher uses our stealthy browser — a version of [DynamicFetcher](https://scrapling.readthedocs.io/en/latest/fetching/dynamic.html) that nearly bypasses all annoying anti-bot protections, provides tools to handle the rest, and automatically bypasses all types of Cloudflare's Turnstile/Interstitial!\n\nWe keep improving these two with each update, so stay tuned :)\n\n### Solving issues B1 & B2: Extreme Website Diversity / Identifying Relevant Data\n\nThis one is tough to handle, but Scrapling's flexibility makes it possible.\n\n
I talked with someone who uses AI to extract prices from different websites. He is only interested in prices and titles, so he uses AI to find the price for him.\n\nI told him he didn't need AI here and gave this code as an example:\n\n```python\nprice_element = page.find_by_regex(r'£[\\d\\.,]+', first_match=True)  # Get the first element whose text matches the price regex, e.g., £10.50\n# If you want the container/element that contains the price element\nprice_element_container = price_element.parent or price_element.find_ancestor(lambda ancestor: ancestor.has_class('product'))  # or other methods...\ntarget_element_selector = price_element_container.generate_css_selector or price_element_container.generate_full_css_selector  # or xpath\n```\n\nThen he asked, what about cases like this?\n\n```html\n<span class='currency'> $ </span> <span class='a-price'> 45,000 </span>\n```\n\nSo, I updated the code like this:\n\n```python\nprice_element_container = page.find_by_regex(r'[\\d,]+', first_match=True).parent  # Adjusted the regex for this example\nfull_price_data = price_element_container.get_all_text(strip=True)  # Returns '$45,000' in this case\n```\n\nThis was enough for his use case. You can try the first regex, and if it doesn't match anything, try the next regex, and so on. Try to cover the most common patterns first, then the less common ones.\nIt will be a bit boring, but it's definitely less expensive than AI.\n\nThis example illustrates the point I aim to convey here. Not every challenge needs AI to solve it; sometimes you just need to be creative, and that might save you a lot of money.\n\n### Solving issue B3: Pagination variations\n\nFor this issue, Scrapling doesn't currently have a direct method that extracts pagination URLs for you automatically, but one will be added in upcoming updates :)\n\nBut you can handle most websites if you search for the most common patterns with `page.find_by_text('Next')['href']` or `page.find_by_text('load more')['href']` or selectors like `'a[href*=\"?page=\"]'` or `'a[href*=\"/page/\"]'` — you get the idea.\n\n
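As a rough illustration only (the patterns are site-specific placeholders, and this assumes these lookups return `None` or an empty result when nothing matches, as `page.find()` does), such a fallback chain could look like this:\n\n```python\n# Try the most common \"next page\" patterns one by one; each lookup is expected\n# to yield an element or None, so the `or` chain falls through to the next pattern\nnext_link = (\n    page.find_by_text('Next')\n    or page.css('a[href*=\"?page=\"]').first\n    or page.css('a[href*=\"/page/\"]').first\n)\nif next_link is not None:\n    next_url = next_link['href']\n```\n\n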
## Cost Comparison and Savings\n\nFor a quick comparison:\n\n| Aspect         | Scrapling                                                                  | AI-Based Tools (e.g., Browse AI, Oxylabs)                                  |\n|----------------|----------------------------------------------------------------------------|----------------------------------------------------------------------------|\n| Cost Structure | Likely free or low-cost, no per-use fees                                   | Starts at $19/month (Browse AI) to $49/month (Oxylabs), scales with usage  |\n| Setup Effort   | Requires little technical expertise, manual setup                          | Often no-code, easier for non-technical users                              |\n| Usage options  | Through code, terminal, or MCP server.                                     | Often through GUI or API, depending on the option the company is providing |\n| Scalability    | Depends on user implementation                                             | Built-in support for large-scale, managed services                         |\n| Adaptability   | High with features like `adaptive` and the non-selector selection methods  | High, automatic with AI, but costly for frequent changes                   |\n\nThis table is based on pricing from [Browse AI Pricing](https://www.browse.ai/pricing) and [Oxylabs Web Scraper API Pricing](https://oxylabs.io/products/scraper-api/web/pricing).\n\n## Conclusion\n\nWhile AI offers powerful capabilities, its cost can be prohibitive for many Web Scraping tasks. Scrapling provides a robust, flexible, and cost-effective toolkit for tackling the real-world challenges of both targeted and broad scraping, often eliminating the need for expensive AI solutions. You can build resilient scrapers more efficiently by leveraging features like `adaptive`, diverse selection methods, and advanced fetchers.\n\nExplore the documentation further and see how Scrapling can simplify your future Web Scraping projects!"
  },
  {
    "path": "pyproject.toml",
    "content": "[build-system]\nrequires = [\"setuptools>=61.0\", \"wheel\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[project]\nname = \"scrapling\"\n# Static version instead of a dynamic version so we can get better layer caching while building docker, check the docker file to understand\nversion = \"0.4.2\"\ndescription = \"Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!\"\nreadme = {file = \"README.md\", content-type = \"text/markdown\"}\nlicense = {file = \"LICENSE\"}\nauthors = [\n    {name = \"Karim Shoair\", email = \"karim.shoair@pm.me\"}\n]\nmaintainers = [\n    {name = \"Karim Shoair\", email = \"karim.shoair@pm.me\"}\n]\nkeywords = [\n    \"web-scraping\",\n    \"scraping\",\n    \"automation\",\n    \"browser-automation\",\n    \"data-extraction\",\n    \"html-parsing\",\n    \"undetectable\",\n    \"playwright\",\n    \"selenium-alternative\",\n    \"web-crawler\",\n    \"browser\",\n    \"crawling\",\n    \"headless\",\n    \"scraper\",\n    \"chrome\",\n]\nrequires-python = \">=3.10\"\nclassifiers = [\n    \"Operating System :: OS Independent\",\n    \"Development Status :: 4 - Beta\",\n    # \"Development Status :: 5 - Production/Stable\",\n    # \"Development Status :: 6 - Mature\",\n    # \"Development Status :: 7 - Inactive\",\n    \"Intended Audience :: Developers\",\n    \"Intended Audience :: Information Technology\",\n    \"License :: OSI Approved :: BSD License\",\n    \"Natural Language :: English\",\n    \"Topic :: Internet :: WWW/HTTP\",\n    \"Topic :: Internet :: WWW/HTTP :: Browsers\",\n    \"Topic :: Text Processing :: Markup\",\n    \"Topic :: Text Processing :: Markup :: HTML\",\n    \"Topic :: Scientific/Engineering :: Artificial Intelligence\",\n    \"Topic :: Software Development :: Libraries\",\n    \"Topic :: Software Development :: Libraries :: Application Frameworks\",\n    \"Topic :: Software Development :: Libraries :: Python Modules\",\n    \"Programming Language :: Python :: 3\",\n    \"Programming Language :: Python :: 3 :: Only\",\n    \"Programming Language :: Python :: 3.10\",\n    \"Programming Language :: Python :: 3.11\",\n    \"Programming Language :: Python :: 3.12\",\n    \"Programming Language :: Python :: 3.13\",\n    \"Programming Language :: Python :: Implementation :: CPython\",\n    \"Typing :: Typed\",\n]\ndependencies = [\n    \"lxml>=6.0.2\",\n    \"cssselect>=1.4.0\",\n    \"orjson>=3.11.7\",\n    \"tld>=0.13.2\",\n    \"w3lib>=2.4.0\",\n    \"typing_extensions\",\n]\n\n[project.optional-dependencies]\nfetchers = [\n    \"click>=8.3.0\",\n    \"curl_cffi>=0.14.0\",\n    \"playwright==1.58.0\",\n    \"patchright==1.58.2\",\n    \"browserforge>=1.2.4\",\n    \"apify-fingerprint-datapoints>=0.11.0\",\n    \"msgspec>=0.20.0\",\n    \"anyio>=4.12.1\"\n]\nai = [\n    \"mcp>=1.26.0\",\n    \"markdownify>=1.2.0\",\n    \"scrapling[fetchers]\",\n]\nshell = [\n    \"IPython>=8.37\",  # The last version that supports Python 3.10\n    \"markdownify>=1.2.0\",\n    \"scrapling[fetchers]\",\n]\nall = [\n    \"scrapling[ai,shell]\",\n]\n\n[project.urls]\nHomepage = \"https://github.com/D4Vinci/Scrapling\"\nChangelog = \"https://github.com/D4Vinci/Scrapling/releases\"\nDocumentation = \"https://scrapling.readthedocs.io/en/latest/\"\nRepository = \"https://github.com/D4Vinci/Scrapling\"\n\"Bug Tracker\" = \"https://github.com/D4Vinci/Scrapling/issues\"\n\"Discord\" = \"https://discord.gg/EMgGbDceNQ\"\n\"Release Notes\" = 
\"https://github.com/D4Vinci/Scrapling/releases\"\n\n[project.scripts]\nscrapling = \"scrapling.cli:main\"\n\n[tool.setuptools]\nzip-safe = false\ninclude-package-data = true\n\n[tool.setuptools.packages.find]\nwhere = [\".\"]\ninclude = [\"scrapling*\"]\n\n[tool.mypy]\npython_version = \"3.10\"\nwarn_unused_configs = true\nignore_missing_imports = true\ncheck_untyped_defs = true\n\n[tool.pyright]\npythonVersion = \"3.10\"\ntypeCheckingMode = \"basic\"\ninclude = [\"scrapling\"]\nignore = [\"tests\", \"benchmarks.py\"]"
  },
  {
    "path": "pytest.ini",
    "content": "[pytest]\nasyncio_mode = strict\nasyncio_default_fixture_loop_scope = function\naddopts = -p no:warnings --doctest-modules --ignore=setup.py --verbose\nmarkers =\n    asyncio: marks tests as async\nasyncio_fixture_scope = function"
  },
  {
    "path": "ruff.toml",
    "content": "exclude = [\n    \".git\",\n    \".venv\",\n    \"__pycache__\",\n    \"docs\",\n    \".github\",\n    \"build\",\n    \"dist\",\n    \"tests\",\n    \"benchmarks.py\",\n]\n\n# Assume Python 3.10\ntarget-version = \"py310\"\n# Allow lines to be as long as 120.\nline-length = 120\n\n[lint]\nselect = [\"E\", \"F\", \"W\"]\nignore = [\"E501\", \"F401\", \"F811\"]\n\n[format]\n# Like Black, use double quotes for strings.\nquote-style = \"double\"\n"
  },
  {
    "path": "scrapling/__init__.py",
    "content": "__author__ = \"Karim Shoair (karim.shoair@pm.me)\"\n__version__ = \"0.4.2\"\n__copyright__ = \"Copyright (c) 2024 Karim Shoair\"\n\nfrom typing import Any, TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from scrapling.parser import Selector, Selectors\n    from scrapling.core.custom_types import AttributesHandler, TextHandler\n    from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher\n\n\n# Lazy import mapping\n_LAZY_IMPORTS = {\n    \"Fetcher\": (\"scrapling.fetchers\", \"Fetcher\"),\n    \"Selector\": (\"scrapling.parser\", \"Selector\"),\n    \"Selectors\": (\"scrapling.parser\", \"Selectors\"),\n    \"AttributesHandler\": (\"scrapling.core.custom_types\", \"AttributesHandler\"),\n    \"TextHandler\": (\"scrapling.core.custom_types\", \"TextHandler\"),\n    \"AsyncFetcher\": (\"scrapling.fetchers\", \"AsyncFetcher\"),\n    \"StealthyFetcher\": (\"scrapling.fetchers\", \"StealthyFetcher\"),\n    \"DynamicFetcher\": (\"scrapling.fetchers\", \"DynamicFetcher\"),\n}\n__all__ = [\"Selector\", \"Fetcher\", \"AsyncFetcher\", \"StealthyFetcher\", \"DynamicFetcher\"]\n\n\ndef __getattr__(name: str) -> Any:\n    if name in _LAZY_IMPORTS:\n        module_path, class_name = _LAZY_IMPORTS[name]\n        module = __import__(module_path, fromlist=[class_name])\n        return getattr(module, class_name)\n    else:\n        raise AttributeError(f\"module {__name__!r} has no attribute {name!r}\")\n\n\ndef __dir__() -> list[str]:\n    \"\"\"Support for dir() and autocomplete.\"\"\"\n    return sorted(__all__ + [\"fetchers\", \"parser\", \"cli\", \"core\", \"__author__\", \"__version__\", \"__copyright__\"])\n"
  },
  {
    "path": "scrapling/cli.py",
    "content": "from pathlib import Path\nfrom subprocess import check_output\nfrom sys import executable as python_executable\n\nfrom scrapling.core.utils import log\nfrom scrapling.engines.toolbelt.custom import Response\nfrom scrapling.core.utils._shell import _CookieParser, _ParseHeaders\nfrom scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable\n\nfrom orjson import loads as json_loads, JSONDecodeError\n\ntry:\n    from click import command, option, Choice, group, argument\nexcept (ImportError, ModuleNotFoundError) as e:\n    raise ModuleNotFoundError(\n        \"You need to install scrapling with any of the extras to enable Shell commands. See: https://scrapling.readthedocs.io/en/latest/#installation\"\n    ) from e\n\n__OUTPUT_FILE_HELP__ = \"The output file path can be an HTML file, a Markdown file of the HTML content, or the text content itself. Use file extensions (`.html`/`.md`/`.txt`) respectively.\"\n__PACKAGE_DIR__ = Path(__file__).parent\n\n\ndef __Execute(cmd: List[str], help_line: str) -> None:  # pragma: no cover\n    print(f\"Installing {help_line}...\")\n    _ = check_output(cmd, shell=False)  # nosec B603\n    # I meant to not use try except here\n\n\ndef __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any]]:\n    \"\"\"Parse JSON string into a Python object\"\"\"\n    if not json_string:\n        return None\n\n    try:\n        return json_loads(json_string)\n    except JSONDecodeError as err:  # pragma: no cover\n        raise ValueError(f\"Invalid JSON data '{json_string}': {err}\")\n\n\ndef __Request_and_Save(\n    fetcher_func: Callable[..., Response],\n    url: str,\n    output_file: str,\n    css_selector: Optional[str] = None,\n    **kwargs,\n) -> None:\n    \"\"\"Make a request using the specified fetcher function and save the result\"\"\"\n    from scrapling.core.shell import Convertor\n\n    # Handle relative paths - convert to an absolute path based on the current working directory\n    output_path = Path(output_file)\n    if not output_path.is_absolute():\n        output_path = Path.cwd() / output_file\n\n    response = fetcher_func(url, **kwargs)\n    Convertor.write_content_to_file(response, str(output_path), css_selector)\n    log.info(f\"Content successfully saved to '{output_path}'\")\n\n\ndef __ParseExtractArguments(\n    headers: List[str], cookies: str, params: str, json: Optional[str] = None\n) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str], Optional[Dict[str, str]]]:\n    \"\"\"Parse arguments for extract command\"\"\"\n    parsed_headers, parsed_cookies = _ParseHeaders(headers)\n    if cookies:\n        for key, value in _CookieParser(cookies):\n            try:\n                parsed_cookies[key] = value\n            except Exception as err:\n                raise ValueError(f\"Could not parse cookies '{cookies}': {err}\")\n\n    parsed_json = __ParseJSONData(json)\n    parsed_params = {}\n    for param in params:\n        if \"=\" in param:\n            key, value = param.split(\"=\", 1)\n            parsed_params[key] = value\n\n    return parsed_headers, parsed_cookies, parsed_params, parsed_json\n\n\ndef __BuildRequest(headers: List[str], cookies: str, params: str, json: Optional[str] = None, **kwargs) -> Dict:\n    \"\"\"Build a request object using the specified arguments\"\"\"\n    # Parse parameters\n    parsed_headers, parsed_cookies, parsed_params, parsed_json = __ParseExtractArguments(headers, cookies, params, json)\n    # Build request arguments\n    request_kwargs: Dict[str, 
Any] = {\n        \"headers\": parsed_headers if parsed_headers else None,\n        \"cookies\": parsed_cookies if parsed_cookies else None,\n    }\n    if parsed_json:\n        request_kwargs[\"json\"] = parsed_json\n    if parsed_params:\n        request_kwargs[\"params\"] = parsed_params\n    if \"proxy\" in kwargs:\n        request_kwargs[\"proxy\"] = kwargs.pop(\"proxy\")\n\n    # Parse impersonate parameter if it contains commas (for random selection)\n    if \"impersonate\" in kwargs and \",\" in (kwargs.get(\"impersonate\") or \"\"):\n        kwargs[\"impersonate\"] = [browser.strip() for browser in kwargs[\"impersonate\"].split(\",\")]\n\n    return {**request_kwargs, **kwargs}\n\n\n@command(help=\"Install all Scrapling's Fetchers dependencies\")\n@option(\n    \"-f\",\n    \"--force\",\n    \"force\",\n    is_flag=True,\n    default=False,\n    type=bool,\n    help=\"Force Scrapling to reinstall all Fetchers dependencies\",\n)\ndef install(force):  # pragma: no cover\n    if force or not __PACKAGE_DIR__.joinpath(\".scrapling_dependencies_installed\").exists():\n        __Execute(\n            [python_executable, \"-m\", \"playwright\", \"install\", \"chromium\"],\n            \"Playwright browsers\",\n        )\n        __Execute(\n            [\n                python_executable,\n                \"-m\",\n                \"playwright\",\n                \"install-deps\",\n                \"chromium\",\n            ],\n            \"Playwright dependencies\",\n        )\n        from tld.utils import update_tld_names\n\n        update_tld_names(fail_silently=True)\n        # if no errors raised by the above commands, then we add the below file\n        __PACKAGE_DIR__.joinpath(\".scrapling_dependencies_installed\").touch()\n    else:\n        print(\"The dependencies are already installed\")\n\n\n@command(help=\"Run Scrapling's MCP server (Check the docs for more info).\")\n@option(\n    \"--http\",\n    is_flag=True,\n    default=False,\n    help=\"Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)\",\n)\n@option(\n    \"--host\",\n    type=str,\n    default=\"0.0.0.0\",\n    help=\"The host to use if streamable-http transport is enabled (Default: '0.0.0.0')\",\n)\n@option(\n    \"--port\", type=int, default=8000, help=\"The port to use if streamable-http transport is enabled (Default: 8000)\"\n)\ndef mcp(http, host, port):\n    from scrapling.core.ai import ScraplingMCPServer\n\n    server = ScraplingMCPServer()\n    server.serve(http, host, port)\n\n\n@command(help=\"Interactive scraping console\")\n@option(\n    \"-c\",\n    \"--code\",\n    \"code\",\n    is_flag=False,\n    default=\"\",\n    type=str,\n    help=\"Evaluate the code in the shell, print the result and exit\",\n)\n@option(\n    \"-L\",\n    \"--loglevel\",\n    \"level\",\n    is_flag=False,\n    default=\"debug\",\n    type=Choice([\"debug\", \"info\", \"warning\", \"error\", \"critical\", \"fatal\"], case_sensitive=False),\n    help=\"Log level (default: DEBUG)\",\n)\ndef shell(code, level):\n    from scrapling.core.shell import CustomShell\n\n    console = CustomShell(code=code, log_level=level)\n    console.start()\n\n\n@group(\n    help=\"Fetch web pages using various fetchers and extract full/selected HTML content as HTML, Markdown, or extract text content.\"\n)\ndef extract():\n    \"\"\"Extract content from web pages and save to files\"\"\"\n    pass\n\n\n@extract.command(help=f\"Perform a GET request and save the content to a 
file.\\n\\n{__OUTPUT_FILE_HELP__}\")\n@argument(\"url\", required=True)\n@argument(\"output_file\", required=True)\n@option(\n    \"--headers\",\n    \"-H\",\n    multiple=True,\n    help='HTTP headers in format \"Key: Value\" (can be used multiple times)',\n)\n@option(\"--cookies\", help='Cookies string in format \"name1=value1; name2=value2\"')\n@option(\"--timeout\", type=int, default=30, help=\"Request timeout in seconds (default: 30)\")\n@option(\"--proxy\", help='Proxy URL in format \"http://username:password@host:port\"')\n@option(\n    \"--css-selector\",\n    \"-s\",\n    help=\"CSS selector to extract specific content from the page. It returns all matches.\",\n)\n@option(\n    \"--params\",\n    \"-p\",\n    multiple=True,\n    help='Query parameters in format \"key=value\" (can be used multiple times)',\n)\n@option(\n    \"--follow-redirects/--no-follow-redirects\",\n    default=True,\n    help=\"Whether to follow redirects (default: True)\",\n)\n@option(\n    \"--verify/--no-verify\",\n    default=True,\n    help=\"Whether to verify SSL certificates (default: True)\",\n)\n@option(\n    \"--impersonate\",\n    help=\"Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).\",\n)\n@option(\n    \"--stealthy-headers/--no-stealthy-headers\",\n    default=True,\n    help=\"Use stealthy browser headers (default: True)\",\n)\ndef get(\n    url,\n    output_file,\n    headers,\n    cookies,\n    timeout,\n    proxy,\n    css_selector,\n    params,\n    follow_redirects,\n    verify,\n    impersonate,\n    stealthy_headers,\n):\n    \"\"\"\n    Perform a GET request and save the content to a file.\n\n    :param url: Target URL for the request.\n    :param output_file: Output file path (.md for Markdown, .html for HTML).\n    :param headers: HTTP headers to include in the request.\n    :param cookies: Cookies to use in the request.\n    :param timeout: Number of seconds to wait before timing out.\n    :param proxy: Proxy URL to use. 
(Format: \"http://username:password@localhost:8030\")\n    :param css_selector: CSS selector to extract specific content.\n    :param params: Query string parameters for the request.\n    :param follow_redirects: Whether to follow redirects.\n    :param verify: Whether to verify HTTPS certificates.\n    :param impersonate: Browser version to impersonate.\n    :param stealthy_headers: If enabled, creates and adds real browser headers.\n    \"\"\"\n\n    kwargs = __BuildRequest(\n        headers,\n        cookies,\n        params,\n        None,\n        timeout=timeout,\n        follow_redirects=follow_redirects,\n        verify=verify,\n        stealthy_headers=stealthy_headers,\n        impersonate=impersonate,\n        proxy=proxy,\n    )\n    from scrapling.fetchers import Fetcher\n\n    __Request_and_Save(Fetcher.get, url, output_file, css_selector, **kwargs)\n\n\n@extract.command(help=f\"Perform a POST request and save the content to a file.\\n\\n{__OUTPUT_FILE_HELP__}\")\n@argument(\"url\", required=True)\n@argument(\"output_file\", required=True)\n@option(\n    \"--data\",\n    \"-d\",\n    help='Form data to include in the request body (as string, ex: \"param1=value1&param2=value2\")',\n)\n@option(\"--json\", \"-j\", help=\"JSON data to include in the request body (as string)\")\n@option(\n    \"--headers\",\n    \"-H\",\n    multiple=True,\n    help='HTTP headers in format \"Key: Value\" (can be used multiple times)',\n)\n@option(\"--cookies\", help='Cookies string in format \"name1=value1; name2=value2\"')\n@option(\"--timeout\", type=int, default=30, help=\"Request timeout in seconds (default: 30)\")\n@option(\"--proxy\", help='Proxy URL in format \"http://username:password@host:port\"')\n@option(\n    \"--css-selector\",\n    \"-s\",\n    help=\"CSS selector to extract specific content from the page. It returns all matches.\",\n)\n@option(\n    \"--params\",\n    \"-p\",\n    multiple=True,\n    help='Query parameters in format \"key=value\" (can be used multiple times)',\n)\n@option(\n    \"--follow-redirects/--no-follow-redirects\",\n    default=True,\n    help=\"Whether to follow redirects (default: True)\",\n)\n@option(\n    \"--verify/--no-verify\",\n    default=True,\n    help=\"Whether to verify SSL certificates (default: True)\",\n)\n@option(\n    \"--impersonate\",\n    help=\"Browser to impersonate. Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).\",\n)\n@option(\n    \"--stealthy-headers/--no-stealthy-headers\",\n    default=True,\n    help=\"Use stealthy browser headers (default: True)\",\n)\ndef post(\n    url,\n    output_file,\n    data,\n    json,\n    headers,\n    cookies,\n    timeout,\n    proxy,\n    css_selector,\n    params,\n    follow_redirects,\n    verify,\n    impersonate,\n    stealthy_headers,\n):\n    \"\"\"\n    Perform a POST request and save the content to a file.\n\n    :param url: Target URL for the request.\n    :param output_file: Output file path (.md for Markdown, .html for HTML).\n    :param data: Form data to include in the request body. 
(as string, ex: \"param1=value1&param2=value2\")\n    :param json: A JSON serializable object to include in the body of the request.\n    :param headers: Headers to include in the request.\n    :param cookies: Cookies to use in the request.\n    :param timeout: Number of seconds to wait before timing out.\n    :param proxy: Proxy URL to use.\n    :param css_selector: CSS selector to extract specific content.\n    :param params: Query string parameters for the request.\n    :param follow_redirects: Whether to follow redirects.\n    :param verify: Whether to verify HTTPS certificates.\n    :param impersonate: Browser version to impersonate.\n    :param stealthy_headers: If enabled, creates and adds real browser headers.\n    \"\"\"\n\n    kwargs = __BuildRequest(\n        headers,\n        cookies,\n        params,\n        json,\n        timeout=timeout,\n        follow_redirects=follow_redirects,\n        verify=verify,\n        stealthy_headers=stealthy_headers,\n        impersonate=impersonate,\n        proxy=proxy,\n        data=data,\n    )\n    from scrapling.fetchers import Fetcher\n\n    __Request_and_Save(Fetcher.post, url, output_file, css_selector, **kwargs)\n\n\n@extract.command(help=f\"Perform a PUT request and save the content to a file.\\n\\n{__OUTPUT_FILE_HELP__}\")\n@argument(\"url\", required=True)\n@argument(\"output_file\", required=True)\n@option(\"--data\", \"-d\", help=\"Form data to include in the request body\")\n@option(\"--json\", \"-j\", help=\"JSON data to include in the request body (as string)\")\n@option(\n    \"--headers\",\n    \"-H\",\n    multiple=True,\n    help='HTTP headers in format \"Key: Value\" (can be used multiple times)',\n)\n@option(\"--cookies\", help='Cookies string in format \"name1=value1; name2=value2\"')\n@option(\"--timeout\", type=int, default=30, help=\"Request timeout in seconds (default: 30)\")\n@option(\"--proxy\", help='Proxy URL in format \"http://username:password@host:port\"')\n@option(\n    \"--css-selector\",\n    \"-s\",\n    help=\"CSS selector to extract specific content from the page. It returns all matches.\",\n)\n@option(\n    \"--params\",\n    \"-p\",\n    multiple=True,\n    help='Query parameters in format \"key=value\" (can be used multiple times)',\n)\n@option(\n    \"--follow-redirects/--no-follow-redirects\",\n    default=True,\n    help=\"Whether to follow redirects (default: True)\",\n)\n@option(\n    \"--verify/--no-verify\",\n    default=True,\n    help=\"Whether to verify SSL certificates (default: True)\",\n)\n@option(\n    \"--impersonate\",\n    help=\"Browser to impersonate. 
Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).\",\n)\n@option(\n    \"--stealthy-headers/--no-stealthy-headers\",\n    default=True,\n    help=\"Use stealthy browser headers (default: True)\",\n)\ndef put(\n    url,\n    output_file,\n    data,\n    json,\n    headers,\n    cookies,\n    timeout,\n    proxy,\n    css_selector,\n    params,\n    follow_redirects,\n    verify,\n    impersonate,\n    stealthy_headers,\n):\n    \"\"\"\n    Perform a PUT request and save the content to a file.\n\n    :param url: Target URL for the request.\n    :param output_file: Output file path (.md for Markdown, .html for HTML).\n    :param data: Form data to include in the request body.\n    :param json: A JSON serializable object to include in the body of the request.\n    :param headers: Headers to include in the request.\n    :param cookies: Cookies to use in the request.\n    :param timeout: Number of seconds to wait before timing out.\n    :param proxy: Proxy URL to use.\n    :param css_selector: CSS selector to extract specific content.\n    :param params: Query string parameters for the request.\n    :param follow_redirects: Whether to follow redirects.\n    :param verify: Whether to verify HTTPS certificates.\n    :param impersonate: Browser version to impersonate.\n    :param stealthy_headers: If enabled, creates and adds real browser headers.\n    \"\"\"\n\n    kwargs = __BuildRequest(\n        headers,\n        cookies,\n        params,\n        json,\n        timeout=timeout,\n        follow_redirects=follow_redirects,\n        verify=verify,\n        stealthy_headers=stealthy_headers,\n        impersonate=impersonate,\n        proxy=proxy,\n        data=data,\n    )\n    from scrapling.fetchers import Fetcher\n\n    __Request_and_Save(Fetcher.put, url, output_file, css_selector, **kwargs)\n\n\n@extract.command(help=f\"Perform a DELETE request and save the content to a file.\\n\\n{__OUTPUT_FILE_HELP__}\")\n@argument(\"url\", required=True)\n@argument(\"output_file\", required=True)\n@option(\n    \"--headers\",\n    \"-H\",\n    multiple=True,\n    help='HTTP headers in format \"Key: Value\" (can be used multiple times)',\n)\n@option(\"--cookies\", help='Cookies string in format \"name1=value1; name2=value2\"')\n@option(\"--timeout\", type=int, default=30, help=\"Request timeout in seconds (default: 30)\")\n@option(\"--proxy\", help='Proxy URL in format \"http://username:password@host:port\"')\n@option(\n    \"--css-selector\",\n    \"-s\",\n    help=\"CSS selector to extract specific content from the page. It returns all matches.\",\n)\n@option(\n    \"--params\",\n    \"-p\",\n    multiple=True,\n    help='Query parameters in format \"key=value\" (can be used multiple times)',\n)\n@option(\n    \"--follow-redirects/--no-follow-redirects\",\n    default=True,\n    help=\"Whether to follow redirects (default: True)\",\n)\n@option(\n    \"--verify/--no-verify\",\n    default=True,\n    help=\"Whether to verify SSL certificates (default: True)\",\n)\n@option(\n    \"--impersonate\",\n    help=\"Browser to impersonate. 
Can be a single browser (e.g., chrome) or comma-separated list for random selection (e.g., chrome,firefox,safari).\",\n)\n@option(\n    \"--stealthy-headers/--no-stealthy-headers\",\n    default=True,\n    help=\"Use stealthy browser headers (default: True)\",\n)\ndef delete(\n    url,\n    output_file,\n    headers,\n    cookies,\n    timeout,\n    proxy,\n    css_selector,\n    params,\n    follow_redirects,\n    verify,\n    impersonate,\n    stealthy_headers,\n):\n    \"\"\"\n    Perform a DELETE request and save the content to a file.\n\n    :param url: Target URL for the request.\n    :param output_file: Output file path (.md for Markdown, .html for HTML).\n    :param headers: Headers to include in the request.\n    :param cookies: Cookies to use in the request.\n    :param timeout: Number of seconds to wait before timing out.\n    :param proxy: Proxy URL to use.\n    :param css_selector: CSS selector to extract specific content.\n    :param params: Query string parameters for the request.\n    :param follow_redirects: Whether to follow redirects.\n    :param verify: Whether to verify HTTPS certificates.\n    :param impersonate: Browser version to impersonate.\n    :param stealthy_headers: If enabled, creates and adds real browser headers.\n    \"\"\"\n\n    kwargs = __BuildRequest(\n        headers,\n        cookies,\n        params,\n        None,\n        timeout=timeout,\n        follow_redirects=follow_redirects,\n        verify=verify,\n        stealthy_headers=stealthy_headers,\n        impersonate=impersonate,\n        proxy=proxy,\n    )\n    from scrapling.fetchers import Fetcher\n\n    __Request_and_Save(Fetcher.delete, url, output_file, css_selector, **kwargs)\n\n\n@extract.command(help=f\"Use DynamicFetcher to fetch content with browser automation.\\n\\n{__OUTPUT_FILE_HELP__}\")\n@argument(\"url\", required=True)\n@argument(\"output_file\", required=True)\n@option(\n    \"--headless/--no-headless\",\n    default=True,\n    help=\"Run browser in headless mode (default: True)\",\n)\n@option(\n    \"--disable-resources/--enable-resources\",\n    default=False,\n    help=\"Drop unnecessary resources for speed boost (default: False)\",\n)\n@option(\n    \"--network-idle/--no-network-idle\",\n    default=False,\n    help=\"Wait for network idle (default: False)\",\n)\n@option(\n    \"--timeout\",\n    type=int,\n    default=30000,\n    help=\"Timeout in milliseconds (default: 30000)\",\n)\n@option(\n    \"--wait\",\n    type=int,\n    default=0,\n    help=\"Additional wait time in milliseconds after page load (default: 0)\",\n)\n@option(\n    \"--css-selector\",\n    \"-s\",\n    help=\"CSS selector to extract specific content from the page. It returns all matches.\",\n)\n@option(\"--wait-selector\", help=\"CSS selector to wait for before proceeding\")\n@option(\"--locale\", default=None, help=\"Specify user locale. Defaults to the system default locale.\")\n@option(\n    \"--real-chrome/--no-real-chrome\",\n    default=False,\n    help=\"If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it. 
(default: False)\",\n)\n@option(\"--proxy\", help='Proxy URL in format \"http://username:password@host:port\"')\n@option(\n    \"--extra-headers\",\n    \"-H\",\n    multiple=True,\n    help='Extra headers in format \"Key: Value\" (can be used multiple times)',\n)\ndef fetch(\n    url,\n    output_file,\n    headless,\n    disable_resources,\n    network_idle,\n    timeout,\n    wait,\n    css_selector,\n    wait_selector,\n    locale,\n    real_chrome,\n    proxy,\n    extra_headers,\n):\n    \"\"\"\n    Opens up a browser and fetch content using DynamicFetcher.\n\n    :param url: Target url.\n    :param output_file: Output file path (.md for Markdown, .html for HTML).\n    :param headless: Run the browser in headless/hidden or headful/visible mode.\n    :param disable_resources: Drop requests of unnecessary resources for a speed boost.\n    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.\n    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.\n    :param css_selector: CSS selector to extract specific content.\n    :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n    :param locale: Set the locale for the browser.\n    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n    :param proxy: The proxy to be used with requests.\n    :param extra_headers: Extra headers to add to the request.\n    \"\"\"\n\n    # Parse parameters\n    parsed_headers, _ = _ParseHeaders(extra_headers, False)\n\n    # Build request arguments\n    kwargs = {\n        \"headless\": headless,\n        \"disable_resources\": disable_resources,\n        \"network_idle\": network_idle,\n        \"timeout\": timeout,\n        \"locale\": locale,\n        \"real_chrome\": real_chrome,\n    }\n\n    if wait > 0:\n        kwargs[\"wait\"] = wait\n    if wait_selector:\n        kwargs[\"wait_selector\"] = wait_selector\n    if proxy:\n        kwargs[\"proxy\"] = proxy\n    if parsed_headers:\n        kwargs[\"extra_headers\"] = parsed_headers\n\n    from scrapling.fetchers import DynamicFetcher\n\n    __Request_and_Save(DynamicFetcher.fetch, url, output_file, css_selector, **kwargs)\n\n\n@extract.command(help=f\"Use StealthyFetcher to fetch content with advanced stealth features.\\n\\n{__OUTPUT_FILE_HELP__}\")\n@argument(\"url\", required=True)\n@argument(\"output_file\", required=True)\n@option(\n    \"--headless/--no-headless\",\n    default=True,\n    help=\"Run browser in headless mode (default: True)\",\n)\n@option(\n    \"--disable-resources/--enable-resources\",\n    default=False,\n    help=\"Drop unnecessary resources for speed boost (default: False)\",\n)\n@option(\n    \"--block-webrtc/--allow-webrtc\",\n    default=False,\n    help=\"Block WebRTC entirely (default: False)\",\n)\n@option(\n    \"--solve-cloudflare/--no-solve-cloudflare\",\n    default=False,\n    help=\"Solve Cloudflare challenges (default: False)\",\n)\n@option(\"--allow-webgl/--block-webgl\", default=True, help=\"Allow WebGL (default: True)\")\n@option(\n    \"--network-idle/--no-network-idle\",\n    default=False,\n    help=\"Wait for network idle (default: False)\",\n)\n@option(\n    \"--real-chrome/--no-real-chrome\",\n    default=False,\n    help=\"If you have a Chrome browser installed on your 
device, enable this, and the Fetcher will launch an instance of your browser and use it. (default: False)\",\n)\n@option(\n    \"--hide-canvas/--show-canvas\",\n    default=False,\n    help=\"Add noise to canvas operations (default: False)\",\n)\n@option(\n    \"--timeout\",\n    type=int,\n    default=30000,\n    help=\"Timeout in milliseconds (default: 30000)\",\n)\n@option(\n    \"--wait\",\n    type=int,\n    default=0,\n    help=\"Additional wait time in milliseconds after page load (default: 0)\",\n)\n@option(\n    \"--css-selector\",\n    \"-s\",\n    help=\"CSS selector to extract specific content from the page. It returns all matches.\",\n)\n@option(\"--wait-selector\", help=\"CSS selector to wait for before proceeding\")\n@option(\"--proxy\", help='Proxy URL in format \"http://username:password@host:port\"')\n@option(\n    \"--extra-headers\",\n    \"-H\",\n    multiple=True,\n    help='Extra headers in format \"Key: Value\" (can be used multiple times)',\n)\ndef stealthy_fetch(\n    url,\n    output_file,\n    headless,\n    disable_resources,\n    block_webrtc,\n    solve_cloudflare,\n    allow_webgl,\n    network_idle,\n    real_chrome,\n    hide_canvas,\n    timeout,\n    wait,\n    css_selector,\n    wait_selector,\n    proxy,\n    extra_headers,\n):\n    \"\"\"\n    Opens up a browser with advanced stealth features and fetch content using StealthyFetcher.\n\n    :param url: Target url.\n    :param output_file: Output file path (.md for Markdown, .html for HTML).\n    :param headless: Run the browser in headless/hidden, or headful/visible mode.\n    :param disable_resources: Drop requests of unnecessary resources for a speed boost.\n    :param block_webrtc: Blocks WebRTC entirely.\n    :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.\n    :param allow_webgl: Allow WebGL (recommended to keep enabled).\n    :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n    :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n    :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.\n    :param timeout: The timeout in milliseconds that is used in all operations and waits through the page.\n    :param wait: The time (milliseconds) the fetcher will wait after everything finishes before returning.\n    :param css_selector: CSS selector to extract specific content.\n    :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n    :param proxy: The proxy to be used with requests.\n    :param extra_headers: Extra headers to add to the request.\n    \"\"\"\n\n    # Parse parameters\n    parsed_headers, _ = _ParseHeaders(extra_headers, False)\n\n    # Build request arguments\n    kwargs = {\n        \"headless\": headless,\n        \"disable_resources\": disable_resources,\n        \"block_webrtc\": block_webrtc,\n        \"solve_cloudflare\": solve_cloudflare,\n        \"allow_webgl\": allow_webgl,\n        \"network_idle\": network_idle,\n        \"real_chrome\": real_chrome,\n        \"hide_canvas\": hide_canvas,\n        \"timeout\": timeout,\n    }\n\n    if wait > 0:\n        kwargs[\"wait\"] = wait\n    if wait_selector:\n        kwargs[\"wait_selector\"] = wait_selector\n    if proxy:\n        kwargs[\"proxy\"] = proxy\n    if parsed_headers:\n        kwargs[\"extra_headers\"] = parsed_headers\n\n    from scrapling.fetchers import 
StealthyFetcher\n\n    __Request_and_Save(StealthyFetcher.fetch, url, output_file, css_selector, **kwargs)\n\n\n@group()\ndef main():\n    pass\n\n\n# Adding commands\nmain.add_command(install)\nmain.add_command(shell)\nmain.add_command(extract)\nmain.add_command(mcp)\n"
  },
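  {
    "path": "examples/extract_get_example.py",
    "content": "# NOTE: Hypothetical example added for illustration only; this file is not part of the upstream\n# Scrapling package. It sketches what the `scrapling extract get <url> <output_file>` command in\n# the CLI module above does programmatically, assuming the `Fetcher.get` and\n# `Convertor.write_content_to_file` APIs used there.\nfrom pathlib import Path\n\nfrom scrapling.fetchers import Fetcher\nfrom scrapling.core.shell import Convertor\n\nif __name__ == \"__main__\":\n    url = \"https://example.com\"\n    # The `.html`/`.md`/`.txt` extension decides the output format, as described in __OUTPUT_FILE_HELP__\n    output_path = Path.cwd() / \"page.md\"\n\n    # Same keyword arguments the CLI builds via __BuildRequest for a plain GET request\n    response = Fetcher.get(\n        url,\n        timeout=30,\n        follow_redirects=True,\n        verify=True,\n        stealthy_headers=True,\n    )\n    # Write the full page; pass a CSS selector as the third argument to save matching elements only\n    Convertor.write_content_to_file(response, str(output_path), None)\n"
  },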
  {
    "path": "scrapling/core/__init__.py",
    "content": ""
  },
  {
    "path": "scrapling/core/_shell_signatures.py",
    "content": "from scrapling.core._types import (\n    Any,\n    Dict,\n    List,\n    Tuple,\n    Sequence,\n    Callable,\n    Optional,\n    SetCookieParam,\n    SelectorWaitStates,\n)\n\n# Parameter definitions for shell function signatures (defined once at module level)\n# Mirrors TypedDict definitions from _types.py but runtime-accessible for IPython introspection\n_REQUESTS_PARAMS = {\n    \"params\": Optional[Dict | List | Tuple],\n    \"cookies\": Any,\n    \"auth\": Optional[Tuple[str, str]],\n    \"impersonate\": Any,\n    \"http3\": Optional[bool],\n    \"stealthy_headers\": Optional[bool],\n    \"proxies\": Any,\n    \"proxy\": Optional[str],\n    \"proxy_auth\": Optional[Tuple[str, str]],\n    \"timeout\": Optional[int | float],\n    \"headers\": Any,\n    \"retries\": Optional[int],\n    \"retry_delay\": Optional[int],\n    \"follow_redirects\": Optional[bool],\n    \"max_redirects\": Optional[int],\n    \"verify\": Optional[bool],\n    \"cert\": Optional[str | Tuple[str, str]],\n    \"selector_config\": Optional[Dict],\n}\n\n_FETCH_PARAMS = {\n    \"headless\": bool,\n    \"disable_resources\": bool,\n    \"network_idle\": bool,\n    \"load_dom\": bool,\n    \"wait_selector\": Optional[str],\n    \"wait_selector_state\": SelectorWaitStates,\n    \"cookies\": Sequence[SetCookieParam],\n    \"google_search\": bool,\n    \"wait\": int | float,\n    \"timezone_id\": str | None,\n    \"page_action\": Optional[Callable],\n    \"proxy\": Optional[str | Dict[str, str] | Tuple],\n    \"extra_headers\": Optional[Dict[str, str]],\n    \"timeout\": int | float,\n    \"init_script\": Optional[str],\n    \"user_data_dir\": str,\n    \"selector_config\": Optional[Dict],\n    \"additional_args\": Optional[Dict],\n    \"locale\": Optional[str],\n    \"real_chrome\": bool,\n    \"cdp_url\": Optional[str],\n    \"useragent\": Optional[str],\n    \"extra_flags\": Optional[List[str]],\n}\n\n_STEALTHY_FETCH_PARAMS = {\n    \"headless\": bool,\n    \"disable_resources\": bool,\n    \"network_idle\": bool,\n    \"load_dom\": bool,\n    \"wait_selector\": Optional[str],\n    \"wait_selector_state\": SelectorWaitStates,\n    \"cookies\": Sequence[SetCookieParam],\n    \"google_search\": bool,\n    \"wait\": int | float,\n    \"timezone_id\": str | None,\n    \"page_action\": Optional[Callable],\n    \"proxy\": Optional[str | Dict[str, str] | Tuple],\n    \"extra_headers\": Optional[Dict[str, str]],\n    \"timeout\": int | float,\n    \"init_script\": Optional[str],\n    \"user_data_dir\": str,\n    \"selector_config\": Optional[Dict],\n    \"additional_args\": Optional[Dict],\n    \"locale\": Optional[str],\n    \"real_chrome\": bool,\n    \"cdp_url\": Optional[str],\n    \"useragent\": Optional[str],\n    \"extra_flags\": Optional[List[str]],\n    \"allow_webgl\": bool,\n    \"hide_canvas\": bool,\n    \"block_webrtc\": bool,\n    \"solve_cloudflare\": bool,\n}\n\n# Mapping of function names to their parameter definitions\nSignatures_map = {\n    \"get\": _REQUESTS_PARAMS,\n    \"post\": {**_REQUESTS_PARAMS, \"data\": Optional[Dict | str], \"json\": Optional[Dict | List]},\n    \"put\": {**_REQUESTS_PARAMS, \"data\": Optional[Dict | str], \"json\": Optional[Dict | List]},\n    \"delete\": _REQUESTS_PARAMS,\n    \"fetch\": _FETCH_PARAMS,\n    \"stealthy_fetch\": _STEALTHY_FETCH_PARAMS,\n}\n"
  },
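  {
    "path": "examples/shell_signature_example.py",
    "content": "# NOTE: Hypothetical example added for illustration only; not part of the upstream Scrapling\n# package. It shows one plausible way the runtime parameter map from\n# scrapling/core/_shell_signatures.py could be turned into an `inspect.Signature` object,\n# e.g., for shell/IPython introspection. `build_signature` is an illustrative helper name.\nimport inspect\n\nfrom scrapling.core._shell_signatures import Signatures_map\n\n\ndef build_signature(func_name: str) -> inspect.Signature:\n    \"\"\"Build a keyword-only signature from the parameter map of the given fetcher method.\"\"\"\n    parameters = [\n        inspect.Parameter(name, kind=inspect.Parameter.KEYWORD_ONLY, annotation=annotation)\n        for name, annotation in Signatures_map[func_name].items()\n    ]\n    return inspect.Signature(parameters)\n\n\nif __name__ == \"__main__\":\n    # Prints something like: (*, params: ..., cookies: ..., timeout: ..., ...)\n    print(build_signature(\"get\"))\n"
  },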
  {
    "path": "scrapling/core/_types.py",
    "content": "\"\"\"\nType definitions for type checking purposes.\n\"\"\"\n\nfrom typing import (\n    TYPE_CHECKING,\n    TypeAlias,\n    cast,\n    overload,\n    Any,\n    Callable,\n    Dict,\n    Generator,\n    AsyncGenerator,\n    Generic,\n    Iterable,\n    List,\n    Set,\n    Literal,\n    Optional,\n    Iterator,\n    Pattern,\n    Sequence,\n    Tuple,\n    TypeVar,\n    Union,\n    Match,\n    Mapping,\n    Awaitable,\n    Protocol,\n    Coroutine,\n    SupportsIndex,\n)\nfrom typing_extensions import Self, Unpack, TypedDict\n\n# Proxy can be a string URL or a dict (Playwright format: {\"server\": \"...\", \"username\": \"...\", \"password\": \"...\"})\nProxyType = Union[str, Dict[str, str]]\nSUPPORTED_HTTP_METHODS = Literal[\"GET\", \"POST\", \"PUT\", \"DELETE\"]\nSelectorWaitStates = Literal[\"attached\", \"detached\", \"hidden\", \"visible\"]\nPageLoadStates = Literal[\"commit\", \"domcontentloaded\", \"load\", \"networkidle\"]\nextraction_types = Literal[\"text\", \"html\", \"markdown\"]\nStrOrBytes = Union[str, bytes]\n\n\n# Copied from `playwright._impl._api_structures.SetCookieParam`\nclass SetCookieParam(TypedDict, total=False):\n    name: str\n    value: str\n    url: Optional[str]\n    domain: Optional[str]\n    path: Optional[str]\n    expires: Optional[float]\n    httpOnly: Optional[bool]\n    secure: Optional[bool]\n    sameSite: Optional[Literal[\"Lax\", \"None\", \"Strict\"]]\n    partitionKey: Optional[str]\n"
  },
  {
    "path": "scrapling/core/ai.py",
    "content": "from asyncio import gather\n\nfrom mcp.server.fastmcp import FastMCP\nfrom pydantic import BaseModel, Field\n\nfrom scrapling.core.shell import Convertor\nfrom scrapling.engines.toolbelt.custom import Response as _ScraplingResponse\nfrom scrapling.engines.static import ImpersonateType\nfrom scrapling.fetchers import (\n    Fetcher,\n    FetcherSession,\n    DynamicFetcher,\n    AsyncDynamicSession,\n    StealthyFetcher,\n    AsyncStealthySession,\n)\nfrom scrapling.core._types import (\n    Optional,\n    Tuple,\n    Mapping,\n    Dict,\n    List,\n    Any,\n    Generator,\n    Sequence,\n    SetCookieParam,\n    extraction_types,\n    SelectorWaitStates,\n)\n\n\nclass ResponseModel(BaseModel):\n    \"\"\"Request's response information structure.\"\"\"\n\n    status: int = Field(description=\"The status code returned by the website.\")\n    content: list[str] = Field(description=\"The content as Markdown/HTML or the text content of the page.\")\n    url: str = Field(description=\"The URL given by the user that resulted in this response.\")\n\n\ndef _content_translator(content: Generator[str, None, None], page: _ScraplingResponse) -> ResponseModel:\n    \"\"\"Convert a content generator to a list of ResponseModel objects.\"\"\"\n    return ResponseModel(status=page.status, content=[result for result in content], url=page.url)\n\n\ndef _normalize_credentials(credentials: Optional[Dict[str, str]]) -> Optional[Tuple[str, str]]:\n    \"\"\"Convert a credentials dictionary to a tuple accepted by fetchers.\"\"\"\n    if not credentials:\n        return None\n\n    username = credentials.get(\"username\")\n    password = credentials.get(\"password\")\n\n    if username is None or password is None:\n        raise ValueError(\"Credentials dictionary must contain both 'username' and 'password' keys\")\n\n    return username, password\n\n\nclass ScraplingMCPServer:\n    @staticmethod\n    def get(\n        url: str,\n        impersonate: ImpersonateType = \"chrome\",\n        extraction_type: extraction_types = \"markdown\",\n        css_selector: Optional[str] = None,\n        main_content_only: bool = True,\n        params: Optional[Dict] = None,\n        headers: Optional[Mapping[str, Optional[str]]] = None,\n        cookies: Optional[Dict[str, str]] = None,\n        timeout: Optional[int | float] = 30,\n        follow_redirects: bool = True,\n        max_redirects: int = 30,\n        retries: Optional[int] = 3,\n        retry_delay: Optional[int] = 1,\n        proxy: Optional[str] = None,\n        proxy_auth: Optional[Dict[str, str]] = None,\n        auth: Optional[Dict[str, str]] = None,\n        verify: Optional[bool] = True,\n        http3: Optional[bool] = False,\n        stealthy_headers: Optional[bool] = True,\n    ) -> ResponseModel:\n        \"\"\"Make GET HTTP request to a URL and return a structured output of the result.\n        Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.\n        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.\n\n        :param url: The URL to request.\n        :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.\n        :param extraction_type: The type of content to extract from the page. Defaults to \"markdown\". 
Options are:\n            - Markdown will convert the page content to Markdown format.\n            - HTML will return the raw HTML content of the page.\n            - Text will return the text content of the page.\n        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.\n        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.\n        :param params: Query string parameters for the request.\n        :param headers: Headers to include in the request.\n        :param cookies: Cookies to use in the request.\n        :param timeout: Number of seconds to wait before timing out.\n        :param follow_redirects: Whether to follow redirects. Defaults to True.\n        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.\n        :param retries: Number of retry attempts. Defaults to 3.\n        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.\n        :param proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n                     Cannot be used together with the `proxies` parameter.\n        :param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.\n        :param auth: HTTP basic auth in dictionary format with `username` and `password` keys.\n        :param verify: Whether to verify HTTPS certificates.\n        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.\n        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. 
It also sets a Google referer header.\n        \"\"\"\n        normalized_proxy_auth = _normalize_credentials(proxy_auth)\n        normalized_auth = _normalize_credentials(auth)\n\n        page = Fetcher.get(\n            url,\n            auth=normalized_auth,\n            proxy=proxy,\n            http3=http3,\n            verify=verify,\n            params=params,\n            proxy_auth=normalized_proxy_auth,\n            retry_delay=retry_delay,\n            stealthy_headers=stealthy_headers,\n            impersonate=impersonate,\n            headers=headers,\n            cookies=cookies,\n            timeout=timeout,\n            retries=retries,\n            max_redirects=max_redirects,\n            follow_redirects=follow_redirects,\n        )\n        return _content_translator(\n            Convertor._extract_content(\n                page,\n                css_selector=css_selector,\n                extraction_type=extraction_type,\n                main_content_only=main_content_only,\n            ),\n            page,\n        )\n\n    @staticmethod\n    async def bulk_get(\n        urls: List[str],\n        impersonate: ImpersonateType = \"chrome\",\n        extraction_type: extraction_types = \"markdown\",\n        css_selector: Optional[str] = None,\n        main_content_only: bool = True,\n        params: Optional[Dict] = None,\n        headers: Optional[Mapping[str, Optional[str]]] = None,\n        cookies: Optional[Dict[str, str]] = None,\n        timeout: Optional[int | float] = 30,\n        follow_redirects: bool = True,\n        max_redirects: int = 30,\n        retries: Optional[int] = 3,\n        retry_delay: Optional[int] = 1,\n        proxy: Optional[str] = None,\n        proxy_auth: Optional[Dict[str, str]] = None,\n        auth: Optional[Dict[str, str]] = None,\n        verify: Optional[bool] = True,\n        http3: Optional[bool] = False,\n        stealthy_headers: Optional[bool] = True,\n    ) -> List[ResponseModel]:\n        \"\"\"Make GET HTTP request to a group of URLs and for each URL, return a structured output of the result.\n        Note: This is only suitable for low-mid protection levels. For high-protection levels or websites that require JS loading, use the other tools directly.\n        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.\n\n        :param urls: A list of the URLs to request.\n        :param impersonate: Browser version to impersonate its fingerprint. It's using the latest chrome version by default.\n        :param extraction_type: The type of content to extract from the page. Defaults to \"markdown\". Options are:\n            - Markdown will convert the page content to Markdown format.\n            - HTML will return the raw HTML content of the page.\n            - Text will return the text content of the page.\n        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.\n        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.\n        :param params: Query string parameters for the request.\n        :param headers: Headers to include in the request.\n        :param cookies: Cookies to use in the request.\n        :param timeout: Number of seconds to wait before timing out.\n        :param follow_redirects: Whether to follow redirects. 
Defaults to True.\n        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.\n        :param retries: Number of retry attempts. Defaults to 3.\n        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.\n        :param proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n                     Cannot be used together with the `proxies` parameter.\n        :param proxy_auth: HTTP basic auth for proxy in dictionary format with `username` and `password` keys.\n        :param auth: HTTP basic auth in dictionary format with `username` and `password` keys.\n        :param verify: Whether to verify HTTPS certificates.\n        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.\n        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.\n        \"\"\"\n        normalized_proxy_auth = _normalize_credentials(proxy_auth)\n        normalized_auth = _normalize_credentials(auth)\n\n        async with FetcherSession() as session:\n            tasks: List[Any] = [\n                session.get(\n                    url,\n                    auth=normalized_auth,\n                    proxy=proxy,\n                    http3=http3,\n                    verify=verify,\n                    params=params,\n                    headers=headers,\n                    cookies=cookies,\n                    timeout=timeout,\n                    retries=retries,\n                    proxy_auth=normalized_proxy_auth,\n                    retry_delay=retry_delay,\n                    impersonate=impersonate,\n                    max_redirects=max_redirects,\n                    follow_redirects=follow_redirects,\n                    stealthy_headers=stealthy_headers,\n                )\n                for url in urls\n            ]\n            responses = await gather(*tasks)\n            return [\n                _content_translator(\n                    Convertor._extract_content(\n                        page,\n                        css_selector=css_selector,\n                        extraction_type=extraction_type,\n                        main_content_only=main_content_only,\n                    ),\n                    page,\n                )\n                for page in responses\n            ]\n\n    @staticmethod\n    async def fetch(\n        url: str,\n        extraction_type: extraction_types = \"markdown\",\n        css_selector: Optional[str] = None,\n        main_content_only: bool = True,\n        headless: bool = True,  # noqa: F821\n        google_search: bool = True,\n        real_chrome: bool = False,\n        wait: int | float = 0,\n        proxy: Optional[str | Dict[str, str]] = None,\n        timezone_id: str | None = None,\n        locale: str | None = None,\n        extra_headers: Optional[Dict[str, str]] = None,\n        useragent: Optional[str] = None,\n        cdp_url: Optional[str] = None,\n        timeout: int | float = 30000,\n        disable_resources: bool = False,\n        wait_selector: Optional[str] = None,\n        cookies: Sequence[SetCookieParam] | None = None,\n        network_idle: bool = False,\n        wait_selector_state: SelectorWaitStates = \"attached\",\n    ) -> ResponseModel:\n        \"\"\"Use playwright to open a browser to fetch a URL and return a structured output of the result.\n        Note: This is only suitable for low-mid 
protection levels.\n        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.\n\n        :param url: The URL to request.\n        :param extraction_type: The type of content to extract from the page. Defaults to \"markdown\". Options are:\n            - Markdown will convert the page content to Markdown format.\n            - HTML will return the raw HTML content of the page.\n            - Text will return the text content of the page.\n        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.\n        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.\n        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting\n            rules. Defaults to the system default locale.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request. 
_The referer set by `google_search` takes priority over the referer set here if used together._\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        \"\"\"\n        page = await DynamicFetcher.async_fetch(\n            url,\n            wait=wait,\n            proxy=proxy,\n            locale=locale,\n            timeout=timeout,\n            cookies=cookies,\n            cdp_url=cdp_url,\n            headless=headless,\n            useragent=useragent,\n            timezone_id=timezone_id,\n            real_chrome=real_chrome,\n            network_idle=network_idle,\n            wait_selector=wait_selector,\n            extra_headers=extra_headers,\n            google_search=google_search,\n            disable_resources=disable_resources,\n            wait_selector_state=wait_selector_state,\n        )\n        return _content_translator(\n            Convertor._extract_content(\n                page,\n                css_selector=css_selector,\n                extraction_type=extraction_type,\n                main_content_only=main_content_only,\n            ),\n            page,\n        )\n\n    @staticmethod\n    async def bulk_fetch(\n        urls: List[str],\n        extraction_type: extraction_types = \"markdown\",\n        css_selector: Optional[str] = None,\n        main_content_only: bool = True,\n        headless: bool = True,  # noqa: F821\n        google_search: bool = True,\n        real_chrome: bool = False,\n        wait: int | float = 0,\n        proxy: Optional[str | Dict[str, str]] = None,\n        timezone_id: str | None = None,\n        locale: str | None = None,\n        extra_headers: Optional[Dict[str, str]] = None,\n        useragent: Optional[str] = None,\n        cdp_url: Optional[str] = None,\n        timeout: int | float = 30000,\n        disable_resources: bool = False,\n        wait_selector: Optional[str] = None,\n        cookies: Sequence[SetCookieParam] | None = None,\n        network_idle: bool = False,\n        wait_selector_state: SelectorWaitStates = \"attached\",\n    ) -> List[ResponseModel]:\n        \"\"\"Use playwright to open a browser, then fetch a group of URLs at the same time, and for each page return a structured output of the result.\n        Note: This is only suitable for low-mid protection levels.\n        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.\n\n        :param urls: A list of the URLs to request.\n        :param extraction_type: The type of content to extract from the page. Defaults to \"markdown\". Options are:\n            - Markdown will convert the page content to Markdown format.\n            - HTML will return the raw HTML content of the page.\n            - Text will return the text content of the page.\n        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.\n        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. 
The main content here is the data inside the `<body>` tag.\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request. It should be in a dictionary format that Playwright accepts.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.\n        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting\n            rules. Defaults to the system default locale.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request. 
_The referer set by `google_search` takes priority over the referer set here if used together._\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        \"\"\"\n        async with AsyncDynamicSession(\n            wait=wait,\n            proxy=proxy,\n            locale=locale,\n            timeout=timeout,\n            cookies=cookies,\n            cdp_url=cdp_url,\n            headless=headless,\n            max_pages=len(urls),\n            useragent=useragent,\n            timezone_id=timezone_id,\n            real_chrome=real_chrome,\n            network_idle=network_idle,\n            wait_selector=wait_selector,\n            google_search=google_search,\n            extra_headers=extra_headers,\n            disable_resources=disable_resources,\n            wait_selector_state=wait_selector_state,\n        ) as session:\n            tasks = [session.fetch(url) for url in urls]\n            responses = await gather(*tasks)\n            return [\n                _content_translator(\n                    Convertor._extract_content(\n                        page,\n                        css_selector=css_selector,\n                        extraction_type=extraction_type,\n                        main_content_only=main_content_only,\n                    ),\n                    page,\n                )\n                for page in responses\n            ]\n\n    @staticmethod\n    async def stealthy_fetch(\n        url: str,\n        extraction_type: extraction_types = \"markdown\",\n        css_selector: Optional[str] = None,\n        main_content_only: bool = True,\n        headless: bool = True,  # noqa: F821\n        google_search: bool = True,\n        real_chrome: bool = False,\n        wait: int | float = 0,\n        proxy: Optional[str | Dict[str, str]] = None,\n        timezone_id: str | None = None,\n        locale: str | None = None,\n        extra_headers: Optional[Dict[str, str]] = None,\n        useragent: Optional[str] = None,\n        hide_canvas: bool = False,\n        cdp_url: Optional[str] = None,\n        timeout: int | float = 30000,\n        disable_resources: bool = False,\n        wait_selector: Optional[str] = None,\n        cookies: Sequence[SetCookieParam] | None = None,\n        network_idle: bool = False,\n        wait_selector_state: SelectorWaitStates = \"attached\",\n        block_webrtc: bool = False,\n        allow_webgl: bool = True,\n        solve_cloudflare: bool = False,\n        additional_args: Optional[Dict] = None,\n    ) -> ResponseModel:\n        \"\"\"Use the stealthy fetcher to fetch a URL and return a structured output of the result.\n        Note: This is the only suitable fetcher for high protection levels.\n        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.\n\n        :param url: The URL to request.\n        :param extraction_type: The type of content to extract from the page. Defaults to \"markdown\". Options are:\n            - Markdown will convert the page content to Markdown format.\n            - HTML will return the raw HTML content of the page.\n            - Text will return the text content of the page.\n        :param css_selector: CSS selector to extract the content from the page. If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.\n        :param main_content_only: Whether to extract only the main content of the page. 
Defaults to True. The main content here is the data inside the `<body>` tag.\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request.\n        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.\n        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.\n        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting\n            rules. Defaults to the system default locale.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.\n        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request. 
_The referer set by `google_search` takes priority over the referer set here if used together._\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.\n        \"\"\"\n        page = await StealthyFetcher.async_fetch(\n            url,\n            wait=wait,\n            proxy=proxy,\n            locale=locale,\n            cdp_url=cdp_url,\n            timeout=timeout,\n            cookies=cookies,\n            headless=headless,\n            useragent=useragent,\n            timezone_id=timezone_id,\n            real_chrome=real_chrome,\n            hide_canvas=hide_canvas,\n            allow_webgl=allow_webgl,\n            network_idle=network_idle,\n            block_webrtc=block_webrtc,\n            wait_selector=wait_selector,\n            google_search=google_search,\n            extra_headers=extra_headers,\n            additional_args=additional_args,\n            solve_cloudflare=solve_cloudflare,\n            disable_resources=disable_resources,\n            wait_selector_state=wait_selector_state,\n        )\n        return _content_translator(\n            Convertor._extract_content(\n                page,\n                css_selector=css_selector,\n                extraction_type=extraction_type,\n                main_content_only=main_content_only,\n            ),\n            page,\n        )\n\n    @staticmethod\n    async def bulk_stealthy_fetch(\n        urls: List[str],\n        extraction_type: extraction_types = \"markdown\",\n        css_selector: Optional[str] = None,\n        main_content_only: bool = True,\n        headless: bool = True,  # noqa: F821\n        google_search: bool = True,\n        real_chrome: bool = False,\n        wait: int | float = 0,\n        proxy: Optional[str | Dict[str, str]] = None,\n        timezone_id: str | None = None,\n        locale: str | None = None,\n        extra_headers: Optional[Dict[str, str]] = None,\n        useragent: Optional[str] = None,\n        hide_canvas: bool = False,\n        cdp_url: Optional[str] = None,\n        timeout: int | float = 30000,\n        disable_resources: bool = False,\n        wait_selector: Optional[str] = None,\n        cookies: Sequence[SetCookieParam] | None = None,\n        network_idle: bool = False,\n        wait_selector_state: SelectorWaitStates = \"attached\",\n        block_webrtc: bool = False,\n        allow_webgl: bool = True,\n        solve_cloudflare: bool = False,\n        additional_args: Optional[Dict] = None,\n    ) -> List[ResponseModel]:\n        \"\"\"Use the stealthy fetcher to fetch a group of URLs at the same time, and for each page return a structured output of the result.\n        Note: This is the only suitable fetcher for high protection levels.\n        Note: If the `css_selector` resolves to more than one element, all the elements will be returned.\n\n        :param urls: A list of the URLs to request.\n        :param extraction_type: The type of content to extract from the page. Defaults to \"markdown\". Options are:\n            - Markdown will convert the page content to Markdown format.\n            - HTML will return the raw HTML content of the page.\n            - Text will return the text content of the page.\n        :param css_selector: CSS selector to extract the content from the page. 
If main_content_only is True, then it will be executed on the main content of the page. Defaults to None.\n        :param main_content_only: Whether to extract only the main content of the page. Defaults to True. The main content here is the data inside the `<body>` tag.\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request.\n        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.\n        :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.\n        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting\n            rules. Defaults to the system default locale.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.\n        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request. 
_The referer set by `google_search` takes priority over the referer set here if used together._\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.\n        \"\"\"\n        async with AsyncStealthySession(\n            wait=wait,\n            proxy=proxy,\n            locale=locale,\n            cdp_url=cdp_url,\n            timeout=timeout,\n            cookies=cookies,\n            headless=headless,\n            useragent=useragent,\n            timezone_id=timezone_id,\n            real_chrome=real_chrome,\n            hide_canvas=hide_canvas,\n            allow_webgl=allow_webgl,\n            network_idle=network_idle,\n            block_webrtc=block_webrtc,\n            wait_selector=wait_selector,\n            google_search=google_search,\n            extra_headers=extra_headers,\n            additional_args=additional_args,\n            solve_cloudflare=solve_cloudflare,\n            disable_resources=disable_resources,\n            wait_selector_state=wait_selector_state,\n        ) as session:\n            tasks = [session.fetch(url) for url in urls]\n            responses = await gather(*tasks)\n            return [\n                _content_translator(\n                    Convertor._extract_content(\n                        page,\n                        css_selector=css_selector,\n                        extraction_type=extraction_type,\n                        main_content_only=main_content_only,\n                    ),\n                    page,\n                )\n                for page in responses\n            ]\n\n    def serve(self, http: bool, host: str, port: int):\n        \"\"\"Serve the MCP server.\"\"\"\n        server = FastMCP(name=\"Scrapling\", host=host, port=port)\n        server.add_tool(self.get, title=\"get\", description=self.get.__doc__, structured_output=True)\n        server.add_tool(self.bulk_get, title=\"bulk_get\", description=self.bulk_get.__doc__, structured_output=True)\n        server.add_tool(self.fetch, title=\"fetch\", description=self.fetch.__doc__, structured_output=True)\n        server.add_tool(\n            self.bulk_fetch, title=\"bulk_fetch\", description=self.bulk_fetch.__doc__, structured_output=True\n        )\n        server.add_tool(\n            self.stealthy_fetch, title=\"stealthy_fetch\", description=self.stealthy_fetch.__doc__, structured_output=True\n        )\n        server.add_tool(\n            self.bulk_stealthy_fetch,\n            title=\"bulk_stealthy_fetch\",\n            description=self.bulk_stealthy_fetch.__doc__,\n            structured_output=True,\n        )\n        server.run(transport=\"stdio\" if not http else \"streamable-http\")\n"
  },
  {
    "path": "scrapling/core/custom_types.py",
    "content": "from collections.abc import Mapping\nfrom types import MappingProxyType\nfrom re import compile as re_compile, UNICODE, IGNORECASE\n\nfrom orjson import dumps, loads\nfrom w3lib.html import replace_entities as _replace_entities\n\nfrom scrapling.core._types import (\n    Any,\n    cast,\n    Dict,\n    List,\n    Union,\n    overload,\n    TypeVar,\n    Literal,\n    Pattern,\n    Iterable,\n    Generator,\n    SupportsIndex,\n)\nfrom scrapling.core.utils import _is_iterable, flatten, __CONSECUTIVE_SPACES_REGEX__\n\n# Define type variable for AttributeHandler value type\n_TextHandlerType = TypeVar(\"_TextHandlerType\", bound=\"TextHandler\")\n__CLEANING_TABLE__ = str.maketrans(\"\\t\\r\\n\", \"   \")\n\n\nclass TextHandler(str):\n    \"\"\"Extends standard Python string by adding more functionality\"\"\"\n\n    __slots__ = ()\n\n    def __getitem__(self, key: SupportsIndex | slice) -> \"TextHandler\":  # pragma: no cover\n        lst = super().__getitem__(key)\n        return TextHandler(lst)\n\n    def split(self, sep: str | None = None, maxsplit: SupportsIndex = -1) -> list[Any]:  # pragma: no cover\n        return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])\n\n    def strip(self, chars: str | None = None) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().strip(chars))\n\n    def lstrip(self, chars: str | None = None) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().lstrip(chars))\n\n    def rstrip(self, chars: str | None = None) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().rstrip(chars))\n\n    def capitalize(self) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().capitalize())\n\n    def casefold(self) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().casefold())\n\n    def center(self, width: SupportsIndex, fillchar: str = \" \") -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().center(width, fillchar))\n\n    def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().expandtabs(tabsize))\n\n    def format(self, *args: object, **kwargs: object) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().format(*args, **kwargs))\n\n    def format_map(self, mapping) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().format_map(mapping))\n\n    def join(self, iterable: Iterable[str]) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().join(iterable))\n\n    def ljust(self, width: SupportsIndex, fillchar: str = \" \") -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().ljust(width, fillchar))\n\n    def rjust(self, width: SupportsIndex, fillchar: str = \" \") -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().rjust(width, fillchar))\n\n    def swapcase(self) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().swapcase())\n\n    def title(self) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().title())\n\n    def translate(self, table) -> Union[str, \"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().translate(table))\n\n    def zfill(self, width: SupportsIndex) -> Union[str, 
\"TextHandler\"]:  # pragma: no cover\n        return TextHandler(super().zfill(width))\n\n    def replace(self, old: str, new: str, count: SupportsIndex = -1) -> Union[str, \"TextHandler\"]:\n        return TextHandler(super().replace(old, new, count))\n\n    def upper(self) -> Union[str, \"TextHandler\"]:\n        return TextHandler(super().upper())\n\n    def lower(self) -> Union[str, \"TextHandler\"]:\n        return TextHandler(super().lower())\n\n    ##############\n\n    def sort(self, reverse: bool = False) -> Union[str, \"TextHandler\"]:\n        \"\"\"Return a sorted version of the string\"\"\"\n        return self.__class__(\"\".join(sorted(self, reverse=reverse)))\n\n    def clean(self, remove_entities=False) -> Union[str, \"TextHandler\"]:\n        \"\"\"Return a new version of the string after removing all white spaces and consecutive spaces\"\"\"\n        data = self.translate(__CLEANING_TABLE__)\n        if remove_entities:\n            data = _replace_entities(data)\n        return self.__class__(__CONSECUTIVE_SPACES_REGEX__.sub(\" \", data).strip())\n\n    # For easy copy-paste from Scrapy/parsel code when needed :)\n    def get(self, default=None):  # pragma: no cover\n        return self\n\n    def get_all(self):  # pragma: no cover\n        return self\n\n    extract = get_all\n    extract_first = get\n\n    def json(self) -> Dict:\n        \"\"\"Return JSON response if the response is jsonable otherwise throw error\"\"\"\n        # Using str function as a workaround for orjson issue with subclasses of str.\n        # Check this out: https://github.com/ijl/orjson/issues/445\n        return loads(str(self))\n\n    @overload\n    def re(\n        self,\n        regex: str | Pattern,\n        replace_entities: bool = True,\n        clean_match: bool = False,\n        case_sensitive: bool = True,\n        *,\n        check_match: Literal[True],\n    ) -> bool: ...\n\n    @overload\n    def re(\n        self,\n        regex: str | Pattern,\n        replace_entities: bool = True,\n        clean_match: bool = False,\n        case_sensitive: bool = True,\n        check_match: Literal[False] = False,\n    ) -> \"TextHandlers\": ...\n\n    def re(\n        self,\n        regex: str | Pattern,\n        replace_entities: bool = True,\n        clean_match: bool = False,\n        case_sensitive: bool = True,\n        check_match: bool = False,\n    ) -> Union[\"TextHandlers\", bool]:\n        \"\"\"Apply the given regex to the current text and return a list of strings with the matches.\n\n        :param regex: Can be either a compiled regular expression or a string.\n        :param replace_entities: If enabled character entity references are replaced by their corresponding character\n        :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching\n        :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it\n        :param check_match: Used to quickly check if this regex matches or not without any operations on the results\n\n        \"\"\"\n        if isinstance(regex, str):\n            if case_sensitive:\n                regex = re_compile(regex, UNICODE)\n            else:\n                regex = re_compile(regex, flags=UNICODE | IGNORECASE)\n\n        input_text = self.clean() if clean_match else self\n        results = regex.findall(input_text)\n        if check_match:\n            return bool(results)\n\n        if all(_is_iterable(res) for res in results):\n            
results = flatten(results)\n\n        if not replace_entities:\n            return TextHandlers([TextHandler(string) for string in results])\n\n        return TextHandlers([TextHandler(_replace_entities(s)) for s in results])\n\n    def re_first(\n        self,\n        regex: str | Pattern,\n        default: Any = None,\n        replace_entities: bool = True,\n        clean_match: bool = False,\n        case_sensitive: bool = True,\n    ) -> \"TextHandler\":\n        \"\"\"Apply the given regex to text and return the first match if found, otherwise return the default value.\n\n        :param regex: Can be either a compiled regular expression or a string.\n        :param default: The default value to be returned if there is no match\n        :param replace_entities: If enabled character entity references are replaced by their corresponding character\n        :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching\n        :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it\n\n        \"\"\"\n        result = self.re(\n            regex,\n            replace_entities,\n            clean_match=clean_match,\n            case_sensitive=case_sensitive,\n        )\n        return result[0] if result else default\n\n\nclass TextHandlers(List[TextHandler]):\n    \"\"\"\n    The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.\n    \"\"\"\n\n    __slots__ = ()\n\n    @overload\n    def __getitem__(self, pos: SupportsIndex) -> TextHandler:  # pragma: no cover\n        pass\n\n    @overload\n    def __getitem__(self, pos: slice) -> \"TextHandlers\":  # pragma: no cover\n        pass\n\n    def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, \"TextHandlers\"]:\n        lst = super().__getitem__(pos)\n        if isinstance(pos, slice):\n            return TextHandlers(cast(List[TextHandler], lst))\n        return TextHandler(cast(TextHandler, lst))\n\n    def re(\n        self,\n        regex: str | Pattern,\n        replace_entities: bool = True,\n        clean_match: bool = False,\n        case_sensitive: bool = True,\n    ) -> \"TextHandlers\":\n        \"\"\"Call the ``.re()`` method for each element in this list and return\n        their results flattened as TextHandlers.\n\n        :param regex: Can be either a compiled regular expression or a string.\n        :param replace_entities: If enabled character entity references are replaced by their corresponding character\n        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching\n        :param case_sensitive: if disabled, the function will set the regex to ignore the letters-case while compiling it\n        \"\"\"\n        results = [n.re(regex, replace_entities, clean_match, case_sensitive) for n in self]\n        return TextHandlers(flatten(results))\n\n    def re_first(\n        self,\n        regex: str | Pattern,\n        default: Any = None,\n        replace_entities: bool = True,\n        clean_match: bool = False,\n        case_sensitive: bool = True,\n    ) -> TextHandler:  # pragma: no cover\n        \"\"\"Call the ``.re_first()`` method for each element in this list and return\n        the first result or the default value otherwise.\n\n        :param regex: Can be either a compiled regular expression or a string.\n        :param default: The default value to be returned if there is no match\n       
 :param replace_entities: If enabled character entity references are replaced by their corresponding character\n        :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching\n        :param case_sensitive: If disabled, function will set the regex to ignore the letters-case while compiling it\n        \"\"\"\n        for n in self:\n            for result in n.re(regex, replace_entities, clean_match, case_sensitive):\n                return result\n        return default\n\n    # For easy copy-paste from Scrapy/parsel code when needed :)\n    def get(self, default=None):\n        \"\"\"Returns the first item of the current list\n        :param default: the default value to return if the current list is empty\n        \"\"\"\n        return self[0] if len(self) > 0 else default\n\n    def extract(self):\n        return self\n\n    extract_first = get\n    get_all = extract\n\n\nclass AttributesHandler(Mapping[str, _TextHandlerType]):\n    \"\"\"A read-only mapping to use instead of the standard dictionary for the speed boost, but at the same time I use it to add more functionalities.\n    If the standard dictionary is needed, convert this class to a dictionary with the `dict` function\n    \"\"\"\n\n    __slots__ = (\"_data\",)\n\n    def __init__(self, mapping: Any = None, **kwargs: Any) -> None:\n        mapping = (\n            {key: TextHandler(value) if isinstance(value, str) else value for key, value in mapping.items()}\n            if mapping is not None\n            else {}\n        )\n\n        if kwargs:\n            mapping.update(\n                {key: TextHandler(value) if isinstance(value, str) else value for key, value in kwargs.items()}\n            )\n\n        # Fastest read-only mapping type\n        self._data: Mapping[str, Any] = MappingProxyType(mapping)\n\n    def get(self, key: str, default: Any = None) -> _TextHandlerType:\n        \"\"\"Acts like the standard dictionary `.get()` method\"\"\"\n        return self._data.get(key, default)\n\n    def search_values(self, keyword: str, partial: bool = False) -> Generator[\"AttributesHandler\", None, None]:\n        \"\"\"Search current attributes by values and return a dictionary of each matching item\n        :param keyword: The keyword to search for in the attribute values\n        :param partial: If True, the function will search if keyword in each value instead of perfect match\n        \"\"\"\n        for key, value in self._data.items():\n            if partial:\n                if keyword in value:\n                    yield AttributesHandler({key: value})\n            else:\n                if keyword == value:\n                    yield AttributesHandler({key: value})\n\n    @property\n    def json_string(self) -> bytes:\n        \"\"\"Convert current attributes to JSON bytes if the attributes are JSON serializable otherwise throws error\"\"\"\n        return dumps(dict(self._data))\n\n    def __getitem__(self, key: str) -> _TextHandlerType:\n        return self._data[key]\n\n    def __iter__(self):\n        return iter(self._data)\n\n    def __len__(self):\n        return len(self._data)\n\n    def __repr__(self):\n        return f\"{self.__class__.__name__}({self._data})\"\n\n    def __str__(self):\n        return str(self._data)\n\n    def __contains__(self, key):\n        return key in self._data\n"
  },
  {
    "path": "scrapling/core/mixins.py",
    "content": "from scrapling.core._types import Any, Dict\n\n\nclass SelectorsGeneration:\n    \"\"\"\n    Functions for generating selectors\n    Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm\n    Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591\n    \"\"\"\n\n    # Note: This is a mixin class meant to be used with Selector.\n    # The methods access Selector attributes (._root, .parent, .attrib, .tag, etc.)\n    # through self, which will be a Selector instance at runtime.\n\n    def _general_selection(self: Any, selection: str = \"css\", full_path: bool = False) -> str:\n        \"\"\"Generate a selector for the current element.\n        :return: A string of the generated selector.\n        \"\"\"\n        if self._is_text_node(self._root):\n            return \"\"\n\n        selectorPath = []\n        target = self\n        css = selection.lower() == \"css\"\n        while target is not None:\n            if target.parent:\n                if target.attrib.get(\"id\"):\n                    # id is enough\n                    part = f\"#{target.attrib['id']}\" if css else f\"[@id='{target.attrib['id']}']\"\n                    selectorPath.append(part)\n                    if not full_path:\n                        return \" > \".join(reversed(selectorPath)) if css else \"//*\" + \"/\".join(reversed(selectorPath))\n                else:\n                    part = f\"{target.tag}\"\n                    # We won't use classes anymore because I some websites share exact classes between elements\n                    # classes = target.attrib.get('class', '').split()\n                    # if classes and css:\n                    #     part += f\".{'.'.join(classes)}\"\n                    # else:\n                    counter: Dict[str, int] = {}\n                    for child in target.parent.children:\n                        counter.setdefault(child.tag, 0)\n                        counter[child.tag] += 1\n                        if child._root == target._root:\n                            break\n\n                    if counter[target.tag] > 1:\n                        part += f\":nth-of-type({counter[target.tag]})\" if css else f\"[{counter[target.tag]}]\"\n\n                selectorPath.append(part)\n                target = target.parent\n                if target is None or target.tag == \"html\":\n                    return \" > \".join(reversed(selectorPath)) if css else \"//\" + \"/\".join(reversed(selectorPath))\n            else:\n                break\n\n        return \" > \".join(reversed(selectorPath)) if css else \"//\" + \"/\".join(reversed(selectorPath))\n\n    @property\n    def generate_css_selector(self: Any) -> str:\n        \"\"\"Generate a CSS selector for the current element\n        :return: A string of the generated selector.\n        \"\"\"\n        return self._general_selection()\n\n    @property\n    def generate_full_css_selector(self: Any) -> str:\n        \"\"\"Generate a complete CSS selector for the current element\n        :return: A string of the generated selector.\n        \"\"\"\n        return self._general_selection(full_path=True)\n\n    @property\n    def generate_xpath_selector(self: Any) -> str:\n        \"\"\"Generate an XPath selector for the current element\n        :return: A string of the generated selector.\n        \"\"\"\n        return self._general_selection(\"xpath\")\n\n    @property\n    def generate_full_xpath_selector(self: Any) -> str:\n        
\"\"\"Generate a complete XPath selector for the current element\n        :return: A string of the generated selector.\n        \"\"\"\n        return self._general_selection(\"xpath\", full_path=True)\n"
  },
  {
    "path": "scrapling/core/shell.py",
    "content": "# -*- coding: utf-8 -*-\nfrom sys import stderr\nfrom copy import deepcopy\nfrom functools import wraps\nfrom re import sub as re_sub\nfrom collections import namedtuple\nfrom shlex import split as shlex_split\nfrom inspect import signature, Parameter\nfrom tempfile import mkstemp as make_temp_file\nfrom argparse import ArgumentParser, SUPPRESS\nfrom webbrowser import open as open_in_browser\nfrom urllib.parse import urlparse, urlunparse, parse_qsl\nfrom logging import (\n    DEBUG,\n    INFO,\n    WARNING,\n    ERROR,\n    CRITICAL,\n    FATAL,\n    getLogger,\n    getLevelName,\n)\n\nfrom orjson import loads as json_loads, JSONDecodeError\n\nfrom ._shell_signatures import Signatures_map\nfrom scrapling import __version__\nfrom scrapling.core.utils import log\nfrom scrapling.parser import Selector, Selectors\nfrom scrapling.core.custom_types import TextHandler\nfrom scrapling.engines.toolbelt.custom import Response\nfrom scrapling.core.utils._shell import _ParseHeaders, _CookieParser\nfrom scrapling.core._types import (\n    Callable,\n    Dict,\n    Any,\n    cast,\n    Optional,\n    Generator,\n    extraction_types,\n)\n\n\n_known_logging_levels = {\n    \"debug\": DEBUG,\n    \"info\": INFO,\n    \"warning\": WARNING,\n    \"error\": ERROR,\n    \"critical\": CRITICAL,\n    \"fatal\": FATAL,\n}\n\n\n# Define the structure for parsed context - Simplified for Fetcher args\nRequest = namedtuple(\n    \"Request\",\n    [\n        \"method\",\n        \"url\",\n        \"params\",\n        \"data\",  # Can be str, bytes, or dict (for urlencoded)\n        \"json_data\",  # Python object (dict/list) for JSON payload\n        \"headers\",\n        \"cookies\",\n        \"proxy\",\n        \"follow_redirects\",  # Added for -L flag\n    ],\n)\n\n\n# Suppress exit on error to handle parsing errors gracefully\nclass NoExitArgumentParser(ArgumentParser):  # pragma: no cover\n    def error(self, message):\n        log.error(f\"Curl arguments parsing error: {message}\")\n        raise ValueError(f\"Curl arguments parsing error: {message}\")\n\n    def exit(self, status=0, message=None):\n        if message:\n            log.error(f\"Scrapling shell exited with status {status}: {message}\")\n            self._print_message(message, stderr)\n        raise ValueError(f\"Scrapling shell exited with status {status}: {message or 'Unknown reason'}\")\n\n\nclass CurlParser:\n    \"\"\"Builds the argument parser for relevant curl flags from DevTools.\"\"\"\n\n    def __init__(self) -> None:\n        from scrapling.fetchers import Fetcher as __Fetcher\n\n        self.__fetcher = __Fetcher\n        # We will use argparse parser to parse the curl command directly instead of regex\n        # We will focus more on flags that will show up on curl commands copied from DevTools's network tab\n        _parser = NoExitArgumentParser(add_help=False)  # Disable default help\n        # Basic curl arguments\n        _parser.add_argument(\"curl_command_placeholder\", nargs=\"?\", help=SUPPRESS)\n        _parser.add_argument(\"url\")\n        _parser.add_argument(\"-X\", \"--request\", dest=\"method\", default=None)\n        _parser.add_argument(\"-H\", \"--header\", action=\"append\", default=[])\n        _parser.add_argument(\n            \"-A\", \"--user-agent\", help=\"Will be parsed from -H if present\"\n        )  # Note: DevTools usually includes this in -H\n\n        # Data arguments (prioritizing types common from DevTools)\n        _parser.add_argument(\"-d\", \"--data\", default=None)\n        
_parser.add_argument(\"--data-raw\", default=None)  # Often used by browsers for JSON body\n        _parser.add_argument(\"--data-binary\", default=None)\n        # Keep urlencode for completeness, though less common from browser copy/paste\n        _parser.add_argument(\"--data-urlencode\", action=\"append\", default=[])\n        _parser.add_argument(\"-G\", \"--get\", action=\"store_true\")  # Use GET and put data in URL\n\n        _parser.add_argument(\n            \"-b\",\n            \"--cookie\",\n            default=None,\n            help=\"Send cookies from string/file (string format used by DevTools)\",\n        )\n\n        # Proxy\n        _parser.add_argument(\"-x\", \"--proxy\", default=None)\n        _parser.add_argument(\"-U\", \"--proxy-user\", default=None)  # Basic proxy auth\n\n        # Connection/Security\n        _parser.add_argument(\"-k\", \"--insecure\", action=\"store_true\")\n        _parser.add_argument(\"--compressed\", action=\"store_true\")  # Very common from browsers\n\n        # Other flags often included but may not map directly to request args\n        _parser.add_argument(\"-i\", \"--include\", action=\"store_true\")\n        _parser.add_argument(\"-s\", \"--silent\", action=\"store_true\")\n        _parser.add_argument(\"-v\", \"--verbose\", action=\"store_true\")\n\n        self.parser: NoExitArgumentParser = _parser\n        self._supported_methods = (\"get\", \"post\", \"put\", \"delete\")\n\n    # --- Main Parsing Logic ---\n    def parse(self, curl_command: str) -> Optional[Request]:\n        \"\"\"Parses the curl command string into a structured context for Fetcher.\"\"\"\n\n        clean_command = curl_command.strip().lstrip(\"curl\").strip().replace(\"\\\\\\n\", \" \")\n\n        try:\n            tokens = shlex_split(clean_command)  # Split the string using shell-like syntax\n        except ValueError as e:  # pragma: no cover\n            log.error(f\"Could not split command line: {e}\")\n            return None\n\n        try:\n            parsed_args, unknown = self.parser.parse_known_args(tokens)\n            if unknown:\n                raise AttributeError(f\"Unknown/Unsupported curl arguments: {unknown}\")\n\n        except ValueError:  # pragma: no cover\n            return None\n\n        except AttributeError:\n            raise\n\n        except Exception as e:  # pragma: no cover\n            log.error(f\"An unexpected error occurred during curl arguments parsing: {e}\")\n            return None\n\n        # --- Determine Method ---\n        method = \"get\"  # Default\n        if parsed_args.get:  # `-G` forces GET\n            method = \"get\"\n\n        elif parsed_args.method:\n            method = parsed_args.method.strip().lower()\n\n        # Infer POST if data is present (unless overridden by -X or -G)\n        elif any(\n            [\n                parsed_args.data,\n                parsed_args.data_raw,\n                parsed_args.data_binary,\n                parsed_args.data_urlencode,\n            ]\n        ):\n            method = \"post\"\n\n        headers, cookies = _ParseHeaders(parsed_args.header)\n\n        if parsed_args.cookie:\n            # We are focusing on the string format from DevTools.\n            try:\n                for key, value in _CookieParser(parsed_args.cookie):\n                    # Update the cookie dict, potentially overwriting cookies with the same name from -H 'cookie:'\n                    cookies[key] = value\n                log.debug(f\"Parsed cookies from -b argument: 
{list(cookies.keys())}\")\n            except Exception as e:  # pragma: no cover\n                log.error(f\"Could not parse cookie string from -b '{parsed_args.cookie}': {e}\")\n\n        # --- Process Data Payload ---\n        params = dict()\n        data_payload: Optional[str | bytes | Dict] = None\n        json_payload: Optional[Any] = None\n\n        # DevTools often uses --data-raw for JSON bodies\n        # Precedence: --data-binary > --data-raw / -d > --data-urlencode\n        if parsed_args.data_binary is not None:  # pragma: no cover\n            try:\n                data_payload = parsed_args.data_binary.encode(\"utf-8\")\n                log.debug(\"Using data from --data-binary as bytes.\")\n            except Exception as e:\n                log.warning(\n                    f\"Could not encode binary data '{parsed_args.data_binary}' as bytes: {e}. Using raw string.\"\n                )\n                data_payload = parsed_args.data_binary  # Fallback to string\n\n        elif parsed_args.data_raw is not None:\n            data_payload = parsed_args.data_raw.lstrip(\"$\")\n\n        elif parsed_args.data is not None:\n            data_payload = parsed_args.data\n\n        elif parsed_args.data_urlencode:  # pragma: no cover\n            # Combine and parse urlencoded data\n            combined_data = \"&\".join(parsed_args.data_urlencode)\n            try:\n                data_payload = dict(parse_qsl(combined_data, keep_blank_values=True))\n            except Exception as e:\n                log.warning(f\"Could not parse urlencoded data '{combined_data}': {e}. Treating as raw string.\")\n                data_payload = combined_data\n\n        # Check if raw data looks like JSON, prefer 'json' param if so\n        if isinstance(data_payload, str):\n            try:\n                maybe_json = json_loads(data_payload)\n                if isinstance(maybe_json, (dict, list)):\n                    json_payload = maybe_json\n                    data_payload = None\n            except JSONDecodeError:\n                pass  # Not JSON, keep it in data_payload\n\n        # Handle `-G`: Move data to params if the method is GET\n        if method == \"get\" and data_payload:  # pragma: no cover\n            if isinstance(data_payload, dict):  # From --data-urlencode likely\n                params.update(data_payload)\n            elif isinstance(data_payload, str):\n                try:\n                    params.update(dict(parse_qsl(data_payload, keep_blank_values=True)))\n                except ValueError:\n                    log.warning(f\"Could not parse data '{data_payload}' into GET parameters for -G.\")\n\n            if params:\n                data_payload = None  # Clear data as it's moved to params\n                json_payload = None  # Should not have JSON body with -G\n\n        # --- Process Proxy ---\n        proxies: Optional[Dict[str, str]] = None\n        if parsed_args.proxy:\n            proxy_url = f\"http://{parsed_args.proxy}\" if \"://\" not in parsed_args.proxy else parsed_args.proxy\n\n            if parsed_args.proxy_user:\n                user_pass = parsed_args.proxy_user\n                parts = urlparse(proxy_url)\n                netloc_parts = parts.netloc.split(\"@\")\n                netloc = f\"{user_pass}@{netloc_parts[-1]}\" if len(netloc_parts) > 1 else f\"{user_pass}@{parts.netloc}\"\n                proxy_url = urlunparse(\n                    (\n                        parts.scheme,\n                        netloc,\n            
            parts.path,\n                        parts.params,\n                        parts.query,\n                        parts.fragment,\n                    )\n                )\n\n            # Standard proxy dict format\n            proxies = {\"http\": proxy_url, \"https\": proxy_url}\n            log.debug(f\"Using proxy configuration: {proxies}\")\n\n        # --- Final Context ---\n        return Request(\n            method=method,\n            url=parsed_args.url,\n            params=params,\n            data=data_payload,\n            json_data=json_payload,\n            headers=headers,\n            cookies=cookies,\n            proxy=proxies,\n            follow_redirects=True,  # Scrapling default is True\n        )\n\n    def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:\n        if isinstance(curl_command, (Request, str)):\n            request = self.parse(curl_command) if isinstance(curl_command, str) else curl_command\n\n            # Ensure request parsing was successful before proceeding\n            if request is None:  # pragma: no cover\n                log.error(\"Failed to parse curl command, cannot convert to fetcher.\")\n                return None\n\n            request_args = request._asdict()\n            method = request_args.pop(\"method\").strip().lower()\n            if method in self._supported_methods:\n                request_args[\"json\"] = request_args.pop(\"json_data\")\n\n                # Ensure data/json are removed for non-POST/PUT methods\n                if method not in (\"post\", \"put\"):\n                    _ = request_args.pop(\"data\", None)\n                    _ = request_args.pop(\"json\", None)\n\n                try:\n                    return getattr(self.__fetcher, method)(**request_args)\n                except Exception as e:  # pragma: no cover\n                    log.error(f\"Error calling Fetcher.{method}: {e}\")\n                    return None\n            else:  # pragma: no cover\n                log.error(f'Request method \"{method}\" isn\\'t supported by Scrapling yet')\n                return None\n\n        else:  # pragma: no cover\n            log.error(\"Input must be a valid curl command string or a Request object.\")\n            return None\n\n\ndef _unpack_signature(func, signature_name=None):\n    \"\"\"\n    Unpack TypedDict from Unpack[TypedDict] annotations in **kwargs and reconstruct the signature.\n\n    This allows the interactive shell to show individual parameters instead of just **kwargs, similar to how IDEs display them.\n    \"\"\"\n    try:\n        sig = signature(func)\n        func_name = signature_name or getattr(func, \"__name__\", None)\n\n        # Check if this function has known parameters\n        if func_name not in Signatures_map:\n            return sig\n\n        new_params = []\n        for param in sig.parameters.values():\n            if param.kind == Parameter.VAR_KEYWORD:\n                # Replace **kwargs with individual keyword-only parameters\n                for field_name, field_type in Signatures_map[func_name].items():\n                    new_params.append(\n                        Parameter(field_name, Parameter.KEYWORD_ONLY, default=Parameter.empty, annotation=field_type)\n                    )\n            else:\n                new_params.append(param)\n\n        # Reconstruct signature with unpacked parameters\n        if len(new_params) != len(sig.parameters):\n            return sig.replace(parameters=new_params)\n        return 
sig\n\n    except Exception:  # pragma: no cover\n        return signature(func)\n\n\ndef show_page_in_browser(page: Selector):  # pragma: no cover\n    if not page or not isinstance(page, Selector):\n        log.error(\"Input must be of type `Selector`\")\n        return\n\n    try:\n        fd, fname = make_temp_file(prefix=\"scrapling_view_\", suffix=\".html\")\n        with open(fd, \"w\", encoding=page.encoding) as f:\n            f.write(page.html_content)\n\n        open_in_browser(f\"file://{fname}\")\n    except IOError as e:\n        log.error(f\"Failed to write temporary file for viewing: {e}\")\n    except Exception as e:\n        log.error(f\"An unexpected error occurred while viewing the page: {e}\")\n\n\nclass CustomShell:\n    \"\"\"A custom IPython shell with minimal dependencies\"\"\"\n\n    def __init__(self, code, log_level=\"debug\"):\n        from IPython.terminal.embed import InteractiveShellEmbed as __InteractiveShellEmbed\n        from scrapling.fetchers import (\n            Fetcher as __Fetcher,\n            AsyncFetcher as __AsyncFetcher,\n            FetcherSession as __FetcherSession,\n            DynamicFetcher as __DynamicFetcher,\n            DynamicSession as __DynamicSession,\n            AsyncDynamicSession as __AsyncDynamicSession,\n            StealthyFetcher as __StealthyFetcher,\n            StealthySession as __StealthySession,\n            AsyncStealthySession as __AsyncStealthySession,\n        )\n\n        self.__InteractiveShellEmbed = __InteractiveShellEmbed\n        self.__Fetcher = __Fetcher\n        self.__AsyncFetcher = __AsyncFetcher\n        self.__FetcherSession = __FetcherSession\n        self.__DynamicFetcher = __DynamicFetcher\n        self.__DynamicSession = __DynamicSession\n        self.__AsyncDynamicSession = __AsyncDynamicSession\n        self.__StealthyFetcher = __StealthyFetcher\n        self.__StealthySession = __StealthySession\n        self.__AsyncStealthySession = __AsyncStealthySession\n        self.code = code\n        self.page = None\n        self.pages = Selectors([])\n        self._curl_parser = CurlParser()\n        log_level = log_level.strip().lower()\n\n        if _known_logging_levels.get(log_level):\n            self.log_level = _known_logging_levels[log_level]\n        else:  # pragma: no cover\n            log.warning(f'Unknown log level \"{log_level}\", defaulting to \"DEBUG\"')\n            self.log_level = DEBUG\n\n        self.shell = None\n\n        # Initialize your application components\n        self.init_components()\n\n    def init_components(self):\n        \"\"\"Initialize application components\"\"\"\n        # This is where you'd set up your application-specific objects\n        if self.log_level:\n            getLogger(\"scrapling\").setLevel(self.log_level)\n\n        settings = self.__Fetcher.display_config()\n        settings.pop(\"storage\", None)\n        settings.pop(\"storage_args\", None)\n        log.info(f\"Scrapling {__version__} shell started\")\n        log.info(f\"Logging level is set to '{getLevelName(self.log_level)}'\")\n        log.info(f\"Fetchers' parsing settings: {settings}\")\n\n    @staticmethod\n    def banner():\n        \"\"\"Create a custom banner for the shell\"\"\"\n        return f\"\"\"\n-> Available Scrapling objects:\n   - Fetcher/AsyncFetcher/FetcherSession\n   - DynamicFetcher/DynamicSession/AsyncDynamicSession\n   - StealthyFetcher/StealthySession/AsyncStealthySession\n   - Selector\n\n-> Useful shortcuts:\n   - {\"get\":<30} Shortcut for `Fetcher.get`\n   - 
{\"post\":<30} Shortcut for `Fetcher.post`\n   - {\"put\":<30} Shortcut for `Fetcher.put`\n   - {\"delete\":<30} Shortcut for `Fetcher.delete`\n   - {\"fetch\":<30} Shortcut for `DynamicFetcher.fetch`\n   - {\"stealthy_fetch\":<30} Shortcut for `StealthyFetcher.fetch`\n\n-> Useful commands\n   - {\"page / response\":<30} The response object of the last page you fetched\n   - {\"pages\":<30} Selectors object of the last 5 response objects you fetched\n   - {\"uncurl('curl_command')\":<30} Convert curl command to a Request object. (Optimized to handle curl commands copied from DevTools network tab.)\n   - {\"curl2fetcher('curl_command')\":<30} Convert curl command and make the request with Fetcher. (Optimized to handle curl commands copied from DevTools network tab.)\n   - {\"view(page)\":<30} View page in a browser\n   - {\"help()\":<30} Show this help message (Shell help)\n\nType 'exit' or press Ctrl+D to exit.\n        \"\"\"\n\n    def update_page(self, result):  # pragma: no cover\n        \"\"\"Update the current page and add to pages history\"\"\"\n        self.page = result\n        if isinstance(result, (Response, Selector)):\n            self.pages.append(result)\n            if len(self.pages) > 5:\n                self.pages.pop(0)  # Remove the oldest item\n\n            # Update in IPython namespace too\n            if self.shell:\n                self.shell.user_ns[\"page\"] = self.page\n                self.shell.user_ns[\"response\"] = self.page\n                self.shell.user_ns[\"pages\"] = self.pages\n\n        return result\n\n    def create_wrapper(\n        self, func: Callable, get_signature: bool = True, signature_name: Optional[str] = None\n    ) -> Callable:\n        \"\"\"Create a wrapper that preserves function signature but updates page\"\"\"\n\n        @wraps(func)\n        def wrapper(*args: Any, **kwargs: Any) -> Any:\n            result = func(*args, **kwargs)\n            return self.update_page(result)\n\n        if get_signature:\n            # Explicitly preserve and unpack signature for IPython introspection and autocompletion\n            setattr(wrapper, \"__signature__\", _unpack_signature(func, signature_name))\n        else:\n            setattr(wrapper, \"__signature__\", signature(func))\n\n        return wrapper\n\n    def get_namespace(self):\n        \"\"\"Create a namespace with application-specific objects\"\"\"\n\n        # Create wrapped versions of fetch functions\n        get = self.create_wrapper(self.__Fetcher.get)\n        post = self.create_wrapper(self.__Fetcher.post)\n        put = self.create_wrapper(self.__Fetcher.put)\n        delete = self.create_wrapper(self.__Fetcher.delete)\n        dynamic_fetch = self.create_wrapper(self.__DynamicFetcher.fetch)\n        stealthy_fetch = self.create_wrapper(self.__StealthyFetcher.fetch, signature_name=\"stealthy_fetch\")\n        curl2fetcher = self.create_wrapper(self._curl_parser.convert2fetcher, get_signature=False)\n\n        # Create the namespace dictionary\n        return {\n            \"get\": get,\n            \"post\": post,\n            \"put\": put,\n            \"delete\": delete,\n            \"Fetcher\": self.__Fetcher,\n            \"AsyncFetcher\": self.__AsyncFetcher,\n            \"FetcherSession\": self.__FetcherSession,\n            \"DynamicSession\": self.__DynamicSession,\n            \"AsyncDynamicSession\": self.__AsyncDynamicSession,\n            \"StealthySession\": self.__StealthySession,\n            \"AsyncStealthySession\": self.__AsyncStealthySession,\n    
        \"fetch\": dynamic_fetch,\n            \"DynamicFetcher\": self.__DynamicFetcher,\n            \"stealthy_fetch\": stealthy_fetch,\n            \"StealthyFetcher\": self.__StealthyFetcher,\n            \"Selector\": Selector,\n            \"page\": self.page,\n            \"response\": self.page,\n            \"pages\": self.pages,\n            \"view\": show_page_in_browser,\n            \"uncurl\": self._curl_parser.parse,\n            \"curl2fetcher\": curl2fetcher,\n            \"help\": self.show_help,\n        }\n\n    def show_help(self):  # pragma: no cover\n        \"\"\"Show help information\"\"\"\n        print(self.banner())\n\n    def start(self):  # pragma: no cover\n        \"\"\"Start the interactive shell\"\"\"\n\n        # Get our namespace with application objects\n        namespace = self.get_namespace()\n        ipython_shell = self.__InteractiveShellEmbed(\n            banner1=self.banner(),\n            banner2=\"\",\n            enable_tip=False,\n            exit_msg=\"Bye Bye\",\n            user_ns=namespace,\n        )\n        self.shell = ipython_shell\n\n        # If a command was provided, execute it and exit\n        if self.code:\n            log.info(f\"Executing provided code: {self.code}\")\n            try:\n                ipython_shell.run_cell(self.code, store_history=False)\n            except Exception as e:\n                log.error(f\"Error executing initial code: {e}\")\n            return\n\n        ipython_shell()\n\n\nclass Convertor:\n    \"\"\"Utils for the extract shell command\"\"\"\n\n    _extension_map: Dict[str, extraction_types] = {\n        \"md\": \"markdown\",\n        \"html\": \"html\",\n        \"txt\": \"text\",\n    }\n\n    @classmethod\n    def _convert_to_markdown(cls, body: TextHandler) -> str:\n        \"\"\"Convert HTML content to Markdown\"\"\"\n        from markdownify import markdownify\n\n        return markdownify(body)\n\n    @classmethod\n    def _strip_noise_tags(cls, page: Selector) -> Selector:\n        \"\"\"Return a copy of the Selector with noise tags removed.\"\"\"\n        clean_root = deepcopy(page._root)\n        for element in clean_root.iter(*{\"script\", \"style\", \"noscript\", \"svg\"}):\n            element.drop_tree()\n        return Selector(root=clean_root, url=page.url)\n\n    @classmethod\n    def _extract_content(\n        cls,\n        page: Selector,\n        extraction_type: extraction_types = \"markdown\",\n        css_selector: Optional[str] = None,\n        main_content_only: bool = False,\n    ) -> Generator[str, None, None]:\n        \"\"\"Extract the content of a Selector\"\"\"\n        if not page or not isinstance(page, Selector):  # pragma: no cover\n            raise TypeError(\"Input must be of type `Selector`\")\n        elif not extraction_type or extraction_type not in cls._extension_map.values():\n            raise ValueError(f\"Unknown extraction type: {extraction_type}\")\n        else:\n            if main_content_only:\n                page = cast(Selector, page.css(\"body\").first) or page\n                page = cls._strip_noise_tags(page)\n\n            pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))\n            for page in pages:\n                match extraction_type:\n                    case \"markdown\":\n                        yield cls._convert_to_markdown(page.html_content)\n                    case \"html\":\n                        yield page.html_content\n                    case \"text\":\n                        
txt_content = page.get_all_text(\n                            strip=True, ignore_tags=(\"script\", \"style\", \"noscript\", \"svg\", \"iframe\")\n                        )\n                        for s in (\n                            \"\\n\",\n                            \"\\r\",\n                            \"\\t\",\n                            \" \",\n                        ):\n                            # Remove consecutive white-spaces\n                            txt_content = TextHandler(re_sub(f\"[{s}]+\", s, txt_content))\n                        yield txt_content\n            yield \"\"\n\n    @classmethod\n    def write_content_to_file(cls, page: Selector, filename: str, css_selector: Optional[str] = None) -> None:\n        \"\"\"Write a Selector's content to a file\"\"\"\n        if not page or not isinstance(page, Selector):  # pragma: no cover\n            raise TypeError(\"Input must be of type `Selector`\")\n        elif not filename or not isinstance(filename, str) or not filename.strip():\n            raise ValueError(\"Filename must be provided\")\n        elif not filename.endswith((\".md\", \".html\", \".txt\")):\n            raise ValueError(\"Unknown file type: filename must end with '.md', '.html', or '.txt'\")\n        else:\n            with open(filename, \"w\", encoding=page.encoding) as f:\n                extension = filename.split(\".\")[-1]\n                f.write(\n                    \"\".join(\n                        cls._extract_content(\n                            page,\n                            cls._extension_map[extension],\n                            css_selector=css_selector,\n                        )\n                    )\n                )\n"
  },
  {
    "path": "scrapling/core/storage.py",
    "content": "from hashlib import sha256\nfrom threading import RLock\nfrom functools import lru_cache\nfrom abc import ABC, abstractmethod\nfrom sqlite3 import connect as db_connect\n\nfrom orjson import dumps, loads\nfrom lxml.html import HtmlElement\n\nfrom scrapling.core.utils import _StorageTools, log\nfrom scrapling.core._types import Dict, Optional, Any, cast\n\n\nclass StorageSystemMixin(ABC):  # pragma: no cover\n    # If you want to make your own storage system, you have to inherit from this\n    def __init__(self, url: Optional[str] = None):\n        \"\"\"\n        :param url: URL of the website we are working on to separate it from other websites data\n        \"\"\"\n        # Make the url in lowercase to handle this edge case until it's updated: https://github.com/barseghyanartur/tld/issues/124\n        self.url = url.lower() if (url and isinstance(url, str)) else None\n\n    @lru_cache(64, typed=True)\n    def _get_base_url(self, default_value: str = \"default\") -> str:\n        if not self.url:\n            return default_value\n\n        try:\n            from tld import get_tld, Result\n\n            # Fixing the inaccurate return type hint in `get_tld`\n            extracted: Result | None = cast(\n                Result, get_tld(self.url, as_object=True, fail_silently=True, fix_protocol=True)\n            )\n            if not extracted:\n                return default_value\n            return extracted.fld or extracted.domain or default_value\n        except AttributeError:\n            return default_value\n\n    @abstractmethod\n    def save(self, element: HtmlElement, identifier: str) -> None:\n        \"\"\"Saves the element's unique properties to the storage for retrieval and relocation later\n\n        :param element: The element itself which we want to save to storage.\n        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See\n            the docs for more info.\n        \"\"\"\n        raise NotImplementedError(\"Storage system must implement `save` method\")\n\n    @abstractmethod\n    def retrieve(self, identifier: str) -> Optional[Dict]:\n        \"\"\"Using the identifier, we search the storage and return the unique properties of the element\n\n        :param identifier: This is the identifier that will be used to retrieve the element from the storage. 
See\n            the docs for more info.\n        :return: A dictionary of the unique properties\n        \"\"\"\n        raise NotImplementedError(\"Storage system must implement `retrieve` method\")\n\n    @staticmethod\n    @lru_cache(128, typed=True)\n    def _get_hash(identifier: str) -> str:\n        \"\"\"If you want to hash the identifier in your storage system, use this safer method\"\"\"\n        _identifier = identifier.lower().strip()\n        # Hash functions have to take bytes\n        _identifier_bytes = _identifier.encode(\"utf-8\")\n\n        hash_value = sha256(_identifier_bytes).hexdigest()\n        return f\"{hash_value}_{len(_identifier_bytes)}\"  # Length to reduce collision chance\n\n\n@lru_cache(1, typed=True)\nclass SQLiteStorageSystem(StorageSystemMixin):\n    \"\"\"The recommended system to use; it's race condition safe and thread safe.\n    Mainly built so the library can run in threaded frameworks like scrapy or threaded tools\n    > It's optimized for threaded applications, but running it without threads shouldn't make it slow.\"\"\"\n\n    def __init__(self, storage_file: str, url: Optional[str] = None):\n        \"\"\"\n        :param storage_file: File to be used to store elements' data.\n        :param url: URL of the website we are working on to separate it from other websites' data\n\n        \"\"\"\n        super().__init__(url)\n        self.storage_file = storage_file\n        self.lock = RLock()  # Better than Lock for reentrancy\n        # >SQLite default mode in the earlier version is 1 not 2 (1=thread-safe 2=serialized)\n        # `check_same_thread=False` to allow it to be used across different threads.\n        self.connection = db_connect(self.storage_file, check_same_thread=False)\n        # WAL (Write-Ahead Logging) allows for better concurrency.\n        self.connection.execute(\"PRAGMA journal_mode=WAL\")\n        self.cursor = self.connection.cursor()\n        self._setup_database()\n        log.debug(f'Storage system loaded with arguments (storage_file=\"{storage_file}\", url=\"{url}\")')\n\n    def _setup_database(self) -> None:\n        self.cursor.execute(\"\"\"\n            CREATE TABLE IF NOT EXISTS storage (\n                id INTEGER PRIMARY KEY,\n                url TEXT,\n                identifier TEXT,\n                element_data TEXT,\n                UNIQUE (url, identifier)\n            )\n        \"\"\")\n        self.connection.commit()\n\n    def save(self, element: HtmlElement, identifier: str) -> None:\n        \"\"\"Saves the element's unique properties to the storage for retrieval and relocation later\n\n        :param element: The element itself which we want to save to storage.\n        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See\n            the docs for more info.\n        \"\"\"\n        url = self._get_base_url()\n        element_data = _StorageTools.element_to_dict(element)\n        with self.lock:\n            self.cursor.execute(\n                \"\"\"\n                INSERT OR REPLACE INTO storage (url, identifier, element_data)\n                VALUES (?, ?, ?)\n            \"\"\",\n                (url, identifier, dumps(element_data)),\n            )\n            self.cursor.fetchall()\n            self.connection.commit()\n\n    def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:\n        \"\"\"Using the identifier, we search the storage and return the unique properties of the element\n\n        :param identifier: This is the identifier that will be used to retrieve the element from the storage. See\n            the docs for more info.\n        :return: A dictionary of the unique properties\n        \"\"\"\n        url = self._get_base_url()\n        with self.lock:\n            self.cursor.execute(\n                \"SELECT element_data FROM storage WHERE url = ? AND identifier = ?\",\n                (url, identifier),\n            )\n            result = self.cursor.fetchone()\n            if result:\n                return loads(result[0])\n            return None\n\n    def close(self):\n        \"\"\"Close all connections. It will be useful with things like scrapy's Spider.closed() function/signal\"\"\"\n        with self.lock:\n            self.connection.commit()\n            self.cursor.close()\n            self.connection.close()\n\n    def __del__(self):\n        \"\"\"To ensure all connections are closed when the object is destroyed.\"\"\"\n        self.close()\n"
  },
  {
    "path": "scrapling/core/translator.py",
    "content": "\"\"\"\nMost of this file is an adapted version of the parsel library's translator with some modifications simply for 1 important reason...\n\nTo add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match the Parsel/Scrapy selectors format which will be important in future releases but most importantly...\n\nSo you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)\n\n    If you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement\n\"\"\"\n\nfrom functools import lru_cache\n\nfrom cssselect import HTMLTranslator as OriginalHTMLTranslator\nfrom cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr\nfrom cssselect.parser import Element, FunctionalPseudoElement, PseudoElement\n\nfrom scrapling.core._types import Any, Protocol, Self\n\n\nclass XPathExpr(OriginalXPathExpr):\n    textnode: bool = False\n    attribute: str | None = None\n\n    @classmethod\n    def from_xpath(\n        cls,\n        xpath: OriginalXPathExpr,\n        textnode: bool = False,\n        attribute: str | None = None,\n    ) -> Self:\n        x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)\n        x.textnode = textnode\n        x.attribute = attribute\n        return x\n\n    def __str__(self) -> str:\n        path = super().__str__()\n        if self.textnode:\n            if path == \"*\":  # pragma: no cover\n                path = \"text()\"\n            elif path.endswith(\"::*/*\"):  # pragma: no cover\n                path = path[:-3] + \"text()\"\n            else:\n                path += \"/text()\"\n\n        if self.attribute is not None:\n            if path.endswith(\"::*/*\"):  # pragma: no cover\n                path = path[:-2]\n            path += f\"/@{self.attribute}\"\n\n        return path\n\n    def join(\n        self: Self,\n        combiner: str,\n        other: OriginalXPathExpr,\n        *args: Any,\n        **kwargs: Any,\n    ) -> Self:\n        if not isinstance(other, XPathExpr):\n            raise ValueError(  # pragma: no cover\n                f\"Expressions of type {__name__}.XPathExpr can ony join expressions\"\n                f\" of the same type (or its descendants), got {type(other)}\"\n            )\n        super().join(combiner, other, *args, **kwargs)\n        self.textnode = other.textnode\n        self.attribute = other.attribute\n        return self\n\n\n# e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator\nclass TranslatorProtocol(Protocol):\n    def xpath_element(self, selector: Element) -> OriginalXPathExpr:  # pyright: ignore # pragma: no cover\n        pass\n\n    def css_to_xpath(self, css: str, prefix: str = ...) 
-> str:  # pyright: ignore # pragma: no cover\n        pass\n\n\nclass TranslatorMixin:\n    \"\"\"This mixin adds support to CSS pseudo elements via dynamic dispatch.\n\n    Currently supported pseudo-elements are ``::text`` and ``::attr(ATTR_NAME)``.\n    \"\"\"\n\n    def xpath_element(self: TranslatorProtocol, selector: Element) -> XPathExpr:\n        # https://github.com/python/mypy/issues/14757\n        xpath = super().xpath_element(selector)  # type: ignore[safe-super]\n        return XPathExpr.from_xpath(xpath)\n\n    def xpath_pseudo_element(self, xpath: OriginalXPathExpr, pseudo_element: PseudoElement) -> OriginalXPathExpr:\n        \"\"\"\n        Dispatch method that transforms XPath to support the pseudo-element.\n        \"\"\"\n        if isinstance(pseudo_element, FunctionalPseudoElement):\n            method_name = f\"xpath_{pseudo_element.name.replace('-', '_')}_functional_pseudo_element\"\n            method = getattr(self, method_name, None)\n            if not method:  # pragma: no cover\n                raise ExpressionError(f\"The functional pseudo-element ::{pseudo_element.name}() is unknown\")\n            xpath = method(xpath, pseudo_element)\n        else:\n            method_name = f\"xpath_{pseudo_element.replace('-', '_')}_simple_pseudo_element\"\n            method = getattr(self, method_name, None)\n            if not method:  # pragma: no cover\n                raise ExpressionError(f\"The pseudo-element ::{pseudo_element} is unknown\")\n            xpath = method(xpath)\n        return xpath\n\n    @staticmethod\n    def xpath_attr_functional_pseudo_element(xpath: OriginalXPathExpr, function: FunctionalPseudoElement) -> XPathExpr:\n        \"\"\"Support selecting attribute values using ::attr() pseudo-element\"\"\"\n        if function.argument_types() not in ([\"STRING\"], [\"IDENT\"]):  # pragma: no cover\n            raise ExpressionError(f\"Expected a single string or ident for ::attr(), got {function.arguments!r}\")\n        return XPathExpr.from_xpath(xpath, attribute=function.arguments[0].value)\n\n    @staticmethod\n    def xpath_text_simple_pseudo_element(xpath: OriginalXPathExpr) -> XPathExpr:\n        \"\"\"Support selecting text nodes using ::text pseudo-element\"\"\"\n        return XPathExpr.from_xpath(xpath, textnode=True)\n\n\nclass HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):\n    def css_to_xpath(self, css: str, prefix: str = \"descendant-or-self::\") -> str:\n        return super().css_to_xpath(css, prefix)\n\n\ntranslator = HTMLTranslator()\n# Using a function instead of the translator directly to avoid Pyright override error\n\n\n@lru_cache(maxsize=256)\ndef css_to_xpath(query: str) -> str:\n    \"\"\"Return the translated XPath version of a given CSS query\"\"\"\n    return translator.css_to_xpath(query)\n"
  },
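  {
    "path": "examples/sketches/css_to_xpath_usage.py",
    "content": "# Illustrative usage sketch (not part of the library itself): shows how the\n# css_to_xpath helper from scrapling/core/translator.py translates CSS queries,\n# including the ::text and ::attr(ATTR_NAME) pseudo-elements defined there.\n# The exact XPath strings come from cssselect, so the expected values in the\n# comments below are approximations rather than guaranteed output.\n\nfrom scrapling.core.translator import css_to_xpath\n\n# A plain CSS selector gets the default \"descendant-or-self::\" prefix\nprint(css_to_xpath(\"div.quote\"))\n\n# ::text appends \"/text()\" so only text nodes are matched\nprint(css_to_xpath(\"p.title::text\"))\n\n# ::attr(href) appends \"/@href\" so the attribute value is matched\nprint(css_to_xpath(\"a.next::attr(href)\"))\n"
  },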
  {
    "path": "scrapling/core/utils/__init__.py",
    "content": "from ._utils import (\n    log,\n    set_logger,\n    reset_logger,\n    __CONSECUTIVE_SPACES_REGEX__,\n    flatten,\n    _is_iterable,\n    _StorageTools,\n    clean_spaces,\n    html_forbidden,\n)\n"
  },
  {
    "path": "scrapling/core/utils/_shell.py",
    "content": "from http import cookies as Cookie\n\n\nfrom scrapling.core._types import (\n    List,\n    Dict,\n    Tuple,\n)\n\n\ndef _CookieParser(cookie_string):\n    # Errors will be handled on call so the log can be specified\n    cookie_parser = Cookie.SimpleCookie()\n    cookie_parser.load(cookie_string)\n    for key, morsel in cookie_parser.items():\n        yield key, morsel.value\n\n\ndef _ParseHeaders(header_lines: List[str], parse_cookies: bool = True) -> Tuple[Dict[str, str], Dict[str, str]]:\n    \"\"\"Parses headers into separate header and cookie dictionaries.\"\"\"\n    header_dict = dict()\n    cookie_dict = dict()\n\n    for header_line in header_lines:\n        if \":\" not in header_line:\n            if header_line.endswith(\";\"):\n                header_key = header_line[:-1].strip()\n                header_value = \"\"\n                header_dict[header_key] = header_value\n            else:\n                raise ValueError(f\"Could not parse header without colon: '{header_line}'.\")\n        else:\n            header_key, header_value = header_line.split(\":\", 1)\n            header_key = header_key.strip()\n            header_value = header_value.strip()\n\n            if parse_cookies:\n                if header_key.lower() == \"cookie\":\n                    try:\n                        cookie_dict = {key: value for key, value in _CookieParser(header_value)}\n                    except Exception as e:  # pragma: no cover\n                        raise ValueError(f\"Could not parse cookie string from header '{header_value}': {e}\")\n                else:\n                    header_dict[header_key] = header_value\n            else:\n                header_dict[header_key] = header_value\n\n    return header_dict, cookie_dict\n"
  },
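  {
    "path": "examples/sketches/parse_headers_usage.py",
    "content": "# Illustrative usage sketch (not part of the library itself): demonstrates the\n# header/cookie parsing helpers from scrapling/core/utils/_shell.py. The raw\n# header lines below are made up; _ParseHeaders splits \"Key: value\" lines into a\n# headers dict and, when parse_cookies is True, pulls the Cookie header out into\n# a separate cookies dict.\n\nfrom scrapling.core.utils._shell import _ParseHeaders\n\nraw_lines = [\n    \"Host: example.com\",\n    \"Accept-Language: en-US,en;q=0.9\",\n    \"Cookie: session=abc123; theme=dark\",\n]\n\nheaders, cookies = _ParseHeaders(raw_lines)\nprint(headers)  # {'Host': 'example.com', 'Accept-Language': 'en-US,en;q=0.9'}\nprint(cookies)  # {'session': 'abc123', 'theme': 'dark'}\n"
  },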
  {
    "path": "scrapling/core/utils/_utils.py",
    "content": "import logging\nfrom itertools import chain\nfrom re import compile as re_compile\nfrom contextvars import ContextVar, Token\n\nfrom lxml import html\n\nfrom scrapling.core._types import Any, Dict, Iterable, List\n\n# Using cache on top of a class is a brilliant way to achieve a Singleton design pattern without much code\nfrom functools import lru_cache  # isort:skip\n\nhtml_forbidden = (html.HtmlComment,)\n\n__CLEANING_TABLE__ = str.maketrans({\"\\t\": \" \", \"\\n\": None, \"\\r\": None})\n__CONSECUTIVE_SPACES_REGEX__ = re_compile(r\" +\")\n\n\n@lru_cache(1, typed=True)\ndef setup_logger():\n    \"\"\"Create and configure a logger with a standard format.\n\n    :returns: logging.Logger: Configured logger instance\n    \"\"\"\n    logger = logging.getLogger(\"scrapling\")\n    logger.setLevel(logging.INFO)\n\n    formatter = logging.Formatter(fmt=\"[%(asctime)s] %(levelname)s: %(message)s\", datefmt=\"%Y-%m-%d %H:%M:%S\")\n\n    console_handler = logging.StreamHandler()\n    console_handler.setFormatter(formatter)\n\n    # Add handler to logger (if not already added)\n    if not logger.handlers:\n        logger.addHandler(console_handler)\n\n    return logger\n\n\n_current_logger: ContextVar[logging.Logger] = ContextVar(\"scrapling_logger\", default=setup_logger())\n\n\nclass LoggerProxy:\n    def __getattr__(self, name: str):\n        return getattr(_current_logger.get(), name)\n\n\nlog = LoggerProxy()\n\n\ndef set_logger(logger: logging.Logger) -> Token:\n    \"\"\"Set the current context logger. Returns token for reset.\"\"\"\n    return _current_logger.set(logger)\n\n\ndef reset_logger(token: Token) -> None:\n    \"\"\"Reset logger to previous state using token.\"\"\"\n    _current_logger.reset(token)\n\n\ndef flatten(lst: Iterable[Any]) -> List[Any]:\n    return list(chain.from_iterable(lst))\n\n\ndef _is_iterable(obj: Any) -> bool:\n    # This will be used only in regex functions to make sure it's iterable but not string/bytes\n    return isinstance(\n        obj,\n        (\n            list,\n            tuple,\n        ),\n    )\n\n\nclass _StorageTools:\n    @staticmethod\n    def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:\n        if not element.attrib:\n            return {}\n        return {k: v.strip() for k, v in element.attrib.items() if v and v.strip() and k not in forbidden}\n\n    @classmethod\n    def element_to_dict(cls, element: html.HtmlElement) -> Dict:\n        parent = element.getparent()\n        result = {\n            \"tag\": str(element.tag),\n            \"attributes\": cls.__clean_attributes(element),\n            \"text\": element.text.strip() if element.text else None,\n            \"path\": cls._get_element_path(element),\n        }\n        if parent is not None:\n            result.update(\n                {\n                    \"parent_name\": parent.tag,\n                    \"parent_attribs\": dict(parent.attrib),\n                    \"parent_text\": parent.text.strip() if parent.text else None,\n                }\n            )\n\n            siblings = [child.tag for child in parent.iterchildren() if child != element]\n            if siblings:\n                result.update({\"siblings\": tuple(siblings)})\n\n        children = [child.tag for child in element.iterchildren() if not isinstance(child, html_forbidden)]\n        if children:\n            result.update({\"children\": tuple(children)})\n\n        return result\n\n    @classmethod\n    def _get_element_path(cls, element: 
html.HtmlElement):\n        parent = element.getparent()\n        return tuple((element.tag,) if parent is None else (cls._get_element_path(parent) + (element.tag,)))\n\n\n@lru_cache(128, typed=True)\ndef clean_spaces(string):\n    string = string.translate(__CLEANING_TABLE__)\n    return __CONSECUTIVE_SPACES_REGEX__.sub(\" \", string)\n"
  },
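  {
    "path": "examples/sketches/context_logger_usage.py",
    "content": "# Illustrative usage sketch (not part of the library itself): shows the\n# context-local logger pattern from scrapling/core/utils/_utils.py. log is a\n# proxy that forwards attribute access to whichever logger is stored in the\n# ContextVar, so a caller can temporarily swap in its own logger and restore the\n# default one later with the token returned by set_logger.\n\nimport logging\n\nfrom scrapling.core.utils import log, set_logger, reset_logger\n\nlog.info(\"Routed to the default 'scrapling' logger\")\n\ncustom = logging.getLogger(\"my_spider\")\ncustom.setLevel(logging.DEBUG)\n\ntoken = set_logger(custom)\ntry:\n    log.debug(\"Routed to the 'my_spider' logger inside this context\")\nfinally:\n    reset_logger(token)\n\nlog.info(\"Back to the default logger\")\n"
  },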
  {
    "path": "scrapling/engines/__init__.py",
    "content": ""
  },
  {
    "path": "scrapling/engines/_browsers/__init__.py",
    "content": ""
  },
  {
    "path": "scrapling/engines/_browsers/_base.py",
    "content": "from time import time\nfrom asyncio import sleep as asyncio_sleep, Lock\nfrom contextlib import contextmanager, asynccontextmanager\n\nfrom playwright.sync_api._generated import Page\nfrom playwright.sync_api import (\n    Frame,\n    BrowserContext,\n    Response as SyncPlaywrightResponse,\n)\nfrom playwright.async_api._generated import Page as AsyncPage\nfrom playwright.async_api import (\n    Frame as AsyncFrame,\n    Response as AsyncPlaywrightResponse,\n    BrowserContext as AsyncBrowserContext,\n)\nfrom playwright._impl._errors import Error as PlaywrightError\n\nfrom scrapling.parser import Selector\nfrom scrapling.engines._browsers._page import PageInfo, PagePool\nfrom scrapling.engines._browsers._validators import validate, PlaywrightConfig, StealthConfig\nfrom scrapling.engines._browsers._config_tools import __default_chrome_useragent__, __default_useragent__\nfrom scrapling.engines.toolbelt.navigation import (\n    construct_proxy_dict,\n    create_intercept_handler,\n    create_async_intercept_handler,\n)\nfrom scrapling.core._types import (\n    Any,\n    Dict,\n    List,\n    Set,\n    Optional,\n    Callable,\n    TYPE_CHECKING,\n    cast,\n    overload,\n    Tuple,\n    ProxyType,\n    Generator,\n    AsyncGenerator,\n)\nfrom scrapling.engines.constants import STEALTH_ARGS, HARMFUL_ARGS, DEFAULT_ARGS\n\n\nclass SyncSession:\n    _config: \"PlaywrightConfig | StealthConfig\"\n    _context_options: Dict[str, Any]\n\n    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:\n        raise NotImplementedError  # pragma: no cover\n\n    def __init__(self, max_pages: int = 1):\n        self.max_pages = max_pages\n        self.page_pool = PagePool(max_pages)\n        self._max_wait_for_page = 60\n        self.playwright: Any = None\n        self.context: Any = None\n        self.browser: Any = None\n        self._is_alive = False\n\n    def start(self) -> None:\n        pass\n\n    def close(self):  # pragma: no cover\n        \"\"\"Close all resources\"\"\"\n        if not self._is_alive:\n            return\n\n        if self.context:\n            self.context.close()\n            self.context = None\n\n        if self.browser:\n            self.browser.close()\n            self.browser = None\n\n        if self.playwright:\n            self.playwright.stop()\n            self.playwright = None  # pyright: ignore\n\n        self._is_alive = False\n\n    def __enter__(self):\n        self.start()\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        self.close()\n\n    def _initialize_context(self, config: PlaywrightConfig | StealthConfig, ctx: BrowserContext) -> BrowserContext:\n        \"\"\"Initialize the browser context.\"\"\"\n        if config.init_script:\n            ctx.add_init_script(path=config.init_script)\n\n        if config.cookies:  # pragma: no cover\n            ctx.add_cookies(config.cookies)\n\n        return ctx\n\n    def _get_page(\n        self,\n        timeout: int | float,\n        extra_headers: Optional[Dict[str, str]],\n        disable_resources: bool,\n        blocked_domains: Optional[Set[str]] = None,\n        context: Optional[BrowserContext] = None,\n    ) -> PageInfo[Page]:  # pragma: no cover\n        \"\"\"Get a new page to use\"\"\"\n        # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.\n        ctx = context if context is not None else self.context\n        assert ctx is 
not None, \"Browser context not initialized\"\n        page = ctx.new_page()\n        page.set_default_navigation_timeout(timeout)\n        page.set_default_timeout(timeout)\n        if extra_headers:\n            page.set_extra_http_headers(extra_headers)\n\n        if disable_resources or blocked_domains:\n            page.route(\"**/*\", create_intercept_handler(disable_resources, blocked_domains))\n        page_info = self.page_pool.add_page(page)\n        page_info.mark_busy()\n        return page_info\n\n    def get_pool_stats(self) -> Dict[str, int]:\n        \"\"\"Get statistics about the current page pool\"\"\"\n        return {\n            \"total_pages\": self.page_pool.pages_count,\n            \"busy_pages\": self.page_pool.busy_count,\n            \"max_pages\": self.max_pages,\n        }\n\n    @staticmethod\n    def _wait_for_networkidle(page: Page | Frame, timeout: Optional[int] = None):\n        \"\"\"Wait for the page to become idle (no network activity) even if there are never-ending requests.\"\"\"\n        try:\n            page.wait_for_load_state(\"networkidle\", timeout=timeout)\n        except (PlaywrightError, Exception):\n            pass\n\n    def _wait_for_page_stability(self, page: Page | Frame, load_dom: bool, network_idle: bool):\n        page.wait_for_load_state(state=\"load\")\n        if load_dom:\n            page.wait_for_load_state(state=\"domcontentloaded\")\n        if network_idle:\n            self._wait_for_networkidle(page)\n\n    @staticmethod\n    def _create_response_handler(page_info: PageInfo[Page], response_container: List) -> Callable:\n        \"\"\"Create a response handler that captures the final navigation response.\n\n        :param page_info: The PageInfo object containing the page\n        :param response_container: A list to store the final response (mutable container)\n        :return: A callback function for page.on(\"response\", ...)\n        \"\"\"\n\n        def handle_response(finished_response: SyncPlaywrightResponse):\n            if (\n                finished_response.request.resource_type == \"document\"\n                and finished_response.request.is_navigation_request()\n                and finished_response.request.frame == page_info.page.main_frame\n            ):\n                response_container[0] = finished_response\n\n        return handle_response\n\n    @contextmanager\n    def _page_generator(\n        self,\n        timeout: int | float,\n        extra_headers: Optional[Dict[str, str]],\n        disable_resources: bool,\n        proxy: Optional[ProxyType] = None,\n        blocked_domains: Optional[Set[str]] = None,\n    ) -> Generator[\"PageInfo[Page]\", None, None]:\n        \"\"\"Acquire a page - either from persistent context or fresh context with proxy.\"\"\"\n        if proxy:\n            # Rotation mode: create fresh context with the provided proxy\n            if not self.browser:  # pragma: no cover\n                raise RuntimeError(\"Browser not initialized for proxy rotation mode\")\n            context_options = self._build_context_with_proxy(proxy)\n            context: BrowserContext = self.browser.new_context(**context_options)\n\n            try:\n                context = self._initialize_context(self._config, context)\n                page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains, context=context)\n                yield page_info\n            finally:\n                context.close()\n        else:\n            # Standard mode: use PagePool 
with persistent context\n            page_info = self._get_page(timeout, extra_headers, disable_resources, blocked_domains)\n            try:\n                yield page_info\n            finally:\n                page_info.page.close()\n                self.page_pool.pages.remove(page_info)\n\n\nclass AsyncSession:\n    _config: \"PlaywrightConfig | StealthConfig\"\n    _context_options: Dict[str, Any]\n\n    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:\n        raise NotImplementedError  # pragma: no cover\n\n    def __init__(self, max_pages: int = 1):\n        self.max_pages = max_pages\n        self.page_pool = PagePool(max_pages)\n        self._max_wait_for_page = 60\n        self.playwright: Any = None\n        self.context: Any = None\n        self.browser: Any = None\n        self._is_alive = False\n        self._lock = Lock()\n\n    async def start(self) -> None:\n        pass\n\n    async def close(self):\n        \"\"\"Close all resources\"\"\"\n        if not self._is_alive:  # pragma: no cover\n            return\n\n        if self.context:\n            await self.context.close()\n            self.context = None  # pyright: ignore\n\n        if self.browser:\n            await self.browser.close()\n            self.browser = None\n\n        if self.playwright:\n            await self.playwright.stop()\n            self.playwright = None  # pyright: ignore\n\n        self._is_alive = False\n\n    async def __aenter__(self):\n        await self.start()\n        return self\n\n    async def __aexit__(self, exc_type, exc_val, exc_tb):\n        await self.close()\n\n    async def _initialize_context(\n        self, config: PlaywrightConfig | StealthConfig, ctx: AsyncBrowserContext\n    ) -> AsyncBrowserContext:\n        \"\"\"Initialize the browser context.\"\"\"\n        if config.init_script:  # pragma: no cover\n            await ctx.add_init_script(path=config.init_script)\n\n        if config.cookies:  # pragma: no cover\n            await ctx.add_cookies(config.cookies)\n\n        return ctx\n\n    async def _get_page(\n        self,\n        timeout: int | float,\n        extra_headers: Optional[Dict[str, str]],\n        disable_resources: bool,\n        blocked_domains: Optional[Set[str]] = None,\n        context: Optional[AsyncBrowserContext] = None,\n    ) -> PageInfo[AsyncPage]:  # pragma: no cover\n        \"\"\"Get a new page to use\"\"\"\n        ctx = context if context is not None else self.context\n        if TYPE_CHECKING:\n            assert ctx is not None, \"Browser context not initialized\"\n\n        async with self._lock:\n            # If we're at max capacity after cleanup, wait for busy pages to finish\n            if context is None and self.page_pool.pages_count >= self.max_pages:\n                # Only applies when using persistent context\n                start_time = time()\n                while time() - start_time < self._max_wait_for_page:\n                    await asyncio_sleep(0.05)\n                    if self.page_pool.pages_count < self.max_pages:\n                        break\n                else:\n                    raise TimeoutError(\n                        f\"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period\"\n                    )\n\n            page = await ctx.new_page()\n            page.set_default_navigation_timeout(timeout)\n            page.set_default_timeout(timeout)\n            if extra_headers:\n                await 
page.set_extra_http_headers(extra_headers)\n\n            if disable_resources or blocked_domains:\n                await page.route(\"**/*\", create_async_intercept_handler(disable_resources, blocked_domains))\n\n            return self.page_pool.add_page(page)\n\n    def get_pool_stats(self) -> Dict[str, int]:\n        \"\"\"Get statistics about the current page pool\"\"\"\n        return {\n            \"total_pages\": self.page_pool.pages_count,\n            \"busy_pages\": self.page_pool.busy_count,\n            \"max_pages\": self.max_pages,\n        }\n\n    @staticmethod\n    async def _wait_for_networkidle(page: AsyncPage | AsyncFrame, timeout: Optional[int] = None):\n        \"\"\"Wait for the page to become idle (no network activity) even if there are never-ending requests.\"\"\"\n        try:\n            await page.wait_for_load_state(\"networkidle\", timeout=timeout)\n        except (PlaywrightError, Exception):\n            pass\n\n    async def _wait_for_page_stability(self, page: AsyncPage | AsyncFrame, load_dom: bool, network_idle: bool):\n        await page.wait_for_load_state(state=\"load\")\n        if load_dom:\n            await page.wait_for_load_state(state=\"domcontentloaded\")\n        if network_idle:\n            await self._wait_for_networkidle(page)\n\n    @staticmethod\n    def _create_response_handler(page_info: PageInfo[AsyncPage], response_container: List) -> Callable:\n        \"\"\"Create an async response handler that captures the final navigation response.\n\n        :param page_info: The PageInfo object containing the page\n        :param response_container: A list to store the final response (mutable container)\n        :return: A callback function for page.on(\"response\", ...)\n        \"\"\"\n\n        async def handle_response(finished_response: AsyncPlaywrightResponse):\n            if (\n                finished_response.request.resource_type == \"document\"\n                and finished_response.request.is_navigation_request()\n                and finished_response.request.frame == page_info.page.main_frame\n            ):\n                response_container[0] = finished_response\n\n        return handle_response\n\n    @asynccontextmanager\n    async def _page_generator(\n        self,\n        timeout: int | float,\n        extra_headers: Optional[Dict[str, str]],\n        disable_resources: bool,\n        proxy: Optional[ProxyType] = None,\n        blocked_domains: Optional[Set[str]] = None,\n    ) -> AsyncGenerator[\"PageInfo[AsyncPage]\", None]:\n        \"\"\"Acquire a page - either from persistent context or fresh context with proxy.\"\"\"\n        if proxy:\n            # Rotation mode: create fresh context with the provided proxy\n            if not self.browser:  # pragma: no cover\n                raise RuntimeError(\"Browser not initialized for proxy rotation mode\")\n            context_options = self._build_context_with_proxy(proxy)\n            context: AsyncBrowserContext = await self.browser.new_context(**context_options)\n\n            try:\n                context = await self._initialize_context(self._config, context)\n                page_info = await self._get_page(\n                    timeout, extra_headers, disable_resources, blocked_domains, context=context\n                )\n                yield page_info\n            finally:\n                await context.close()\n        else:\n            # Standard mode: use PagePool with persistent context\n            page_info = await self._get_page(timeout, 
extra_headers, disable_resources, blocked_domains)\n            try:\n                yield page_info\n            finally:\n                await page_info.page.close()\n                self.page_pool.pages.remove(page_info)\n\n\nclass BaseSessionMixin:\n    _config: \"PlaywrightConfig | StealthConfig\"\n\n    @overload\n    def __validate_routine__(self, params: Dict, model: type[StealthConfig]) -> StealthConfig: ...\n\n    @overload\n    def __validate_routine__(self, params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...\n\n    def __validate_routine__(\n        self, params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]\n    ) -> PlaywrightConfig | StealthConfig:\n        # Dark color scheme bypasses the 'prefersLightColor' check in creepjs\n        self._context_options: Dict[str, Any] = {\"color_scheme\": \"dark\", \"device_scale_factor\": 2}\n        self._browser_options: Dict[str, Any] = {\n            \"args\": DEFAULT_ARGS,\n            \"ignore_default_args\": HARMFUL_ARGS,\n        }\n        if \"__max_pages\" in params:\n            params[\"max_pages\"] = params.pop(\"__max_pages\")\n\n        config = validate(params, model=model)\n        self._headers_keys = (\n            {header.lower() for header in config.extra_headers.keys()} if config.extra_headers else set()\n        )\n\n        return config\n\n    def __generate_options__(self, extra_flags: Tuple | None = None) -> None:\n        config: PlaywrightConfig | StealthConfig = self._config\n        self._context_options.update(\n            {\n                \"proxy\": config.proxy,\n                \"locale\": config.locale,\n                \"timezone_id\": config.timezone_id,\n                \"extra_http_headers\": config.extra_headers,\n            }\n        )\n        # The default useragent in the headful is always correct now in the current versions of Playwright\n        if config.useragent:\n            self._context_options[\"user_agent\"] = config.useragent\n        elif not config.useragent and config.headless:\n            self._context_options[\"user_agent\"] = (\n                __default_chrome_useragent__ if config.real_chrome else __default_useragent__\n            )\n\n        if not config.cdp_url:\n            flags = self._browser_options[\"args\"]\n            if config.extra_flags or extra_flags:\n                flags = list(set(tuple(flags) + tuple(config.extra_flags or extra_flags or ())))\n\n            self._browser_options.update(\n                {\n                    \"args\": flags,\n                    \"headless\": config.headless,\n                    \"channel\": \"chrome\" if config.real_chrome else \"chromium\",\n                }\n            )\n\n            self._user_data_dir = config.user_data_dir\n        else:\n            self._browser_options = {}\n\n        if config.additional_args:\n            self._context_options.update(config.additional_args)\n\n    def _build_context_with_proxy(self, proxy: Optional[ProxyType] = None) -> Dict[str, Any]:\n        \"\"\"\n        Build context options with a specific proxy for rotation mode.\n\n        :param proxy: Proxy URL string or Playwright-style proxy dict to use for this context.\n        :return: Dictionary of context options for browser.new_context().\n        \"\"\"\n\n        context_options = self._context_options.copy()\n\n        # Override proxy if provided\n        if proxy:\n            context_options[\"proxy\"] = construct_proxy_dict(proxy)\n\n        return 
context_options\n\n\nclass DynamicSessionMixin(BaseSessionMixin):\n    def __validate__(self, **params):\n        self._config = self.__validate_routine__(params, model=PlaywrightConfig)\n        self.__generate_options__()\n\n\nclass StealthySessionMixin(BaseSessionMixin):\n    def __validate__(self, **params):\n        self._config = self.__validate_routine__(params, model=StealthConfig)\n        self._context_options.update(\n            {\n                \"is_mobile\": False,\n                \"has_touch\": False,\n                # I'm thinking about disabling it to rest from all Service Workers' headache, but let's keep it as it is for now\n                \"service_workers\": \"allow\",\n                \"ignore_https_errors\": True,\n                \"screen\": {\"width\": 1920, \"height\": 1080},\n                \"viewport\": {\"width\": 1920, \"height\": 1080},\n                \"permissions\": [\"geolocation\", \"notifications\"],\n            }\n        )\n        self.__generate_stealth_options()\n\n    def __generate_stealth_options(self) -> None:\n        config = cast(StealthConfig, self._config)\n        flags: Tuple[str, ...] = tuple()\n        if not config.cdp_url:\n            flags = tuple(DEFAULT_ARGS) + tuple(STEALTH_ARGS)\n\n            if config.block_webrtc:\n                flags += (\n                    \"--webrtc-ip-handling-policy=disable_non_proxied_udp\",\n                    \"--force-webrtc-ip-handling-policy\",  # Ensures the policy is enforced\n                )\n            if not config.allow_webgl:\n                flags += (\n                    \"--disable-webgl\",\n                    \"--disable-webgl-image-chromium\",\n                    \"--disable-webgl2\",\n                )\n            if config.hide_canvas:\n                flags += (\"--fingerprinting-canvas-image-data-noise\",)\n\n        super(StealthySessionMixin, self).__generate_options__(flags)\n\n    @staticmethod\n    def _detect_cloudflare(page_content: str) -> str | None:\n        \"\"\"\n        Detect the type of Cloudflare challenge present in the provided page content.\n\n        This function analyzes the given page content to identify whether a specific\n        type of Cloudflare challenge is present. It checks for three predefined\n        challenge types: non-interactive, managed, and interactive. If a challenge\n        type is detected, it returns the corresponding type as a string. If no\n        challenge type is detected, it returns None.\n\n        Args:\n            page_content (str): The content of the page to analyze for Cloudflare\n                challenge types.\n\n        Returns:\n            str: A string representing the detected Cloudflare challenge type, if\n                found. Returns None if no challenge matches.\n        \"\"\"\n        challenge_types = (\n            \"non-interactive\",\n            \"managed\",\n            \"interactive\",\n        )\n        for ctype in challenge_types:\n            if f\"cType: '{ctype}'\" in page_content:\n                return ctype\n\n        # Check if turnstile captcha is embedded inside the page (Usually inside a closed Shadow iframe)\n        selector = Selector(content=page_content)\n        if selector.css('script[src*=\"challenges.cloudflare.com/turnstile/v\"]'):\n            return \"embedded\"\n\n        return None\n"
  },
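  {
    "path": "examples/sketches/cloudflare_detection_usage.py",
    "content": "# Illustrative usage sketch (not part of the library itself): exercises the\n# static Cloudflare challenge detector from scrapling/engines/_browsers/_base.py\n# with made-up HTML snippets. The detector looks for the \"cType: '<type>'\" marker\n# first and falls back to spotting an embedded Turnstile widget script.\n\nfrom scrapling.engines._browsers._base import StealthySessionMixin\n\nmanaged_page = \"<html><script>window._cf_chl_opt = { cType: 'managed' };</script></html>\"\nprint(StealthySessionMixin._detect_cloudflare(managed_page))  # managed\n\nturnstile_page = (\n    \"<html><body>\"\n    '<script src=\"https://challenges.cloudflare.com/turnstile/v0/api.js\"></script>'\n    \"</body></html>\"\n)\nprint(StealthySessionMixin._detect_cloudflare(turnstile_page))  # embedded\n\nprint(StealthySessionMixin._detect_cloudflare(\"<html><p>No challenge here</p></html>\"))  # None\n"
  },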
  {
    "path": "scrapling/engines/_browsers/_config_tools.py",
    "content": "from scrapling.engines.toolbelt.fingerprints import generate_headers\n\n__default_useragent__ = generate_headers(browser_mode=True).get(\"User-Agent\")\n__default_chrome_useragent__ = generate_headers(browser_mode=\"chrome\").get(\"User-Agent\")\n"
  },
  {
    "path": "scrapling/engines/_browsers/_controllers.py",
    "content": "from time import sleep as time_sleep\nfrom asyncio import sleep as asyncio_sleep\n\nfrom playwright.sync_api import (\n    Locator,\n    sync_playwright,\n)\nfrom playwright.async_api import (\n    async_playwright,\n    Locator as AsyncLocator,\n)\n\nfrom scrapling.core.utils import log\nfrom scrapling.core._types import Optional, ProxyType, Unpack\nfrom scrapling.engines.toolbelt.proxy_rotation import is_proxy_error\nfrom scrapling.engines.toolbelt.convertor import Response, ResponseFactory\nfrom scrapling.engines._browsers._types import PlaywrightSession, PlaywrightFetchParams\nfrom scrapling.engines._browsers._base import SyncSession, AsyncSession, DynamicSessionMixin\nfrom scrapling.engines._browsers._validators import validate_fetch as _validate, PlaywrightConfig\n\n\nclass DynamicSession(SyncSession, DynamicSessionMixin):\n    \"\"\"A Browser session manager with page pooling.\"\"\"\n\n    __slots__ = (\n        \"_config\",\n        \"_context_options\",\n        \"_browser_options\",\n        \"_user_data_dir\",\n        \"_headers_keys\",\n        \"max_pages\",\n        \"page_pool\",\n        \"_max_wait_for_page\",\n        \"playwright\",\n        \"context\",\n    )\n\n    def __init__(self, **kwargs: Unpack[PlaywrightSession]):\n        \"\"\"A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.\n\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.\n        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting\n            rules. Defaults to the system default locale.\n        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. 
The default state is `attached`.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.\n        :param extra_flags: A list of additional browser flags to pass to the browser on launch.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.\n        \"\"\"\n        self.__validate__(**kwargs)\n        super().__init__()\n\n    def start(self):\n        \"\"\"Create a browser for this instance and context.\"\"\"\n        if not self.playwright:\n            self.playwright = sync_playwright().start()\n\n            try:\n                if self._config.cdp_url:  # pragma: no cover\n                    self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)\n                    if not self._config.proxy_rotator and self.browser:\n                        self.context = self.browser.new_context(**self._context_options)\n                elif self._config.proxy_rotator:\n                    self.browser = self.playwright.chromium.launch(**self._browser_options)\n                else:\n                    persistent_options = (\n                        self._browser_options | self._context_options | {\"user_data_dir\": self._user_data_dir}\n                    )\n                    self.context = self.playwright.chromium.launch_persistent_context(**persistent_options)\n\n                if self.context:\n                    self.context = self._initialize_context(self._config, self.context)\n\n                self._is_alive = True\n            except Exception:\n                # Clean up playwright if browser setup fails\n                self.playwright.stop()\n                self.playwright = None\n                raise\n        else:\n            raise RuntimeError(\"Session has been already started\")\n\n    def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:\n        \"\"\"Opens up the browser and do your request based on your chosen options.\n\n        :param url: The Target url.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. 
The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.\n        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param proxy: Static proxy to override rotator and session proxy. A new browser context will be created and used with it.\n        :return: A `Response` object.\n        \"\"\"\n        static_proxy = kwargs.pop(\"proxy\", None)\n\n        params = _validate(kwargs, self, PlaywrightConfig)\n        if not self._is_alive:  # pragma: no cover\n            raise RuntimeError(\"Context manager has been closed\")\n\n        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()\n        referer = (\n            \"https://www.google.com/\" if (params.google_search and \"referer\" not in request_headers_keys) else None\n        )\n\n        for attempt in range(self._config.retries):\n            proxy: Optional[ProxyType] = None\n            if self._config.proxy_rotator and static_proxy is None:\n                proxy = self._config.proxy_rotator.get_proxy()\n            else:\n                proxy = static_proxy\n\n            with self._page_generator(\n                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains\n            ) as page_info:\n                final_response = [None]\n                page = page_info.page\n                page.on(\"response\", self._create_response_handler(page_info, final_response))\n\n                try:\n                    first_response = page.goto(url, referer=referer)\n                    self._wait_for_page_stability(page, params.load_dom, params.network_idle)\n\n                    if not first_response:\n                        raise RuntimeError(f\"Failed to get response for {url}\")\n\n                    if params.page_action:\n                        try:\n                            _ = params.page_action(page)\n                        except Exception as e:  # pragma: no cover\n                            log.error(f\"Error executing page_action: {e}\")\n\n                    if params.wait_selector:\n                        try:\n                            waiter: Locator = 
page.locator(params.wait_selector)\n                            waiter.first.wait_for(state=params.wait_selector_state)\n                            self._wait_for_page_stability(page, params.load_dom, params.network_idle)\n                        except Exception as e:  # pragma: no cover\n                            log.error(f\"Error waiting for selector {params.wait_selector}: {e}\")\n\n                    page.wait_for_timeout(params.wait)\n\n                    response = ResponseFactory.from_playwright_response(\n                        page, first_response, final_response[0], params.selector_config, meta={\"proxy\": proxy}\n                    )\n                    return response\n\n                except Exception as e:\n                    page_info.mark_error()\n                    if attempt < self._config.retries - 1:\n                        if is_proxy_error(e):\n                            log.warning(\n                                f\"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s...\"\n                            )\n                        else:\n                            log.warning(\n                                f\"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...\"\n                            )\n                        time_sleep(self._config.retry_delay)\n                    else:\n                        log.error(f\"Failed after {self._config.retries} attempts: {e}\")\n                        raise\n\n        raise RuntimeError(\"Request failed\")  # pragma: no cover\n\n\nclass AsyncDynamicSession(AsyncSession, DynamicSessionMixin):\n    \"\"\"An async Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.\"\"\"\n\n    __slots__ = (\n        \"_config\",\n        \"_context_options\",\n        \"_browser_options\",\n        \"_user_data_dir\",\n        \"_headers_keys\",\n    )\n\n    def __init__(self, **kwargs: Unpack[PlaywrightSession]):\n        \"\"\"A Browser session manager with page pooling\n\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param page_action: Added for automation. 
A function that takes the `page` object and does the automation you need.\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.\n        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting\n            rules. Defaults to the system default locale.\n        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.\n        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.\n        :param extra_flags: A list of additional browser flags to pass to the browser on launch.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.\n        \"\"\"\n        self.__validate__(**kwargs)\n        super().__init__(max_pages=self._config.max_pages)\n\n    async def start(self) -> None:\n        \"\"\"Create a browser for this instance and context.\"\"\"\n        if not self.playwright:\n            self.playwright = await async_playwright().start()\n            try:\n                if self._config.cdp_url:\n                    self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)\n                    if not self._config.proxy_rotator and self.browser:\n                        self.context = await self.browser.new_context(**self._context_options)\n                elif self._config.proxy_rotator:\n                    self.browser = await self.playwright.chromium.launch(**self._browser_options)\n                else:\n                    persistent_options = (\n                        self._browser_options | self._context_options | {\"user_data_dir\": self._user_data_dir}\n                    )\n                    self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options)\n\n                if self.context:\n                    self.context = await self._initialize_context(self._config, self.context)\n\n                self._is_alive = True\n            except Exception:\n         
       # Clean up playwright if browser setup fails\n                await self.playwright.stop()\n                self.playwright = None\n                raise\n        else:\n            raise RuntimeError(\"Session has been already started\")\n\n    async def fetch(self, url: str, **kwargs: Unpack[PlaywrightFetchParams]) -> Response:\n        \"\"\"Opens up the browser and do your request based on your chosen options.\n\n        :param url: The Target url.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.\n        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param proxy: Static proxy to override rotator and session proxy. 
A new browser context will be created and used with it.\n        :return: A `Response` object.\n        \"\"\"\n        static_proxy = kwargs.pop(\"proxy\", None)\n\n        params = _validate(kwargs, self, PlaywrightConfig)\n\n        if not self._is_alive:  # pragma: no cover\n            raise RuntimeError(\"Context manager has been closed\")\n\n        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()\n        referer = (\n            \"https://www.google.com/\" if (params.google_search and \"referer\" not in request_headers_keys) else None\n        )\n\n        for attempt in range(self._config.retries):\n            proxy: Optional[ProxyType] = None\n            if self._config.proxy_rotator and static_proxy is None:\n                proxy = self._config.proxy_rotator.get_proxy()\n            else:\n                proxy = static_proxy\n\n            async with self._page_generator(\n                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains\n            ) as page_info:\n                final_response = [None]\n                page = page_info.page\n                page.on(\"response\", self._create_response_handler(page_info, final_response))\n\n                try:\n                    first_response = await page.goto(url, referer=referer)\n                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)\n\n                    if not first_response:\n                        raise RuntimeError(f\"Failed to get response for {url}\")\n\n                    if params.page_action:\n                        try:\n                            _ = await params.page_action(page)\n                        except Exception as e:  # pragma: no cover\n                            log.error(f\"Error executing page_action: {e}\")\n\n                    if params.wait_selector:\n                        try:\n                            waiter: AsyncLocator = page.locator(params.wait_selector)\n                            await waiter.first.wait_for(state=params.wait_selector_state)\n                            await self._wait_for_page_stability(page, params.load_dom, params.network_idle)\n                        except Exception as e:  # pragma: no cover\n                            log.error(f\"Error waiting for selector {params.wait_selector}: {e}\")\n\n                    await page.wait_for_timeout(params.wait)\n\n                    response = await ResponseFactory.from_async_playwright_response(\n                        page, first_response, final_response[0], params.selector_config, meta={\"proxy\": proxy}\n                    )\n                    return response\n\n                except Exception as e:\n                    page_info.mark_error()\n                    if attempt < self._config.retries - 1:\n                        if is_proxy_error(e):\n                            log.warning(\n                                f\"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s...\"\n                            )\n                        else:\n                            log.warning(\n                                f\"Attempt {attempt + 1} failed: {e}. 
Retrying in {self._config.retry_delay}s...\"\n                            )\n                        await asyncio_sleep(self._config.retry_delay)\n                    else:\n                        log.error(f\"Failed after {self._config.retries} attempts: {e}\")\n                        raise\n\n        raise RuntimeError(\"Request failed\")  # pragma: no cover\n"
  },
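  {
    "path": "examples/sketches/dynamic_session_usage.py",
    "content": "# Illustrative usage sketch (not part of the library itself): shows how the\n# session managers from scrapling/engines/_browsers/_controllers.py are meant to\n# be driven as context managers. The target URL and selectors are placeholders,\n# and response.status is assumed from the Response object that ResponseFactory\n# builds inside fetch().\n\nimport asyncio\n\nfrom scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession\n\n\ndef sync_example():\n    # start() launches a persistent Chromium context; __exit__ closes everything\n    with DynamicSession(headless=True, disable_resources=True) as session:\n        response = session.fetch(\"https://quotes.toscrape.com/\", wait_selector=\".quote\")\n        print(session.get_pool_stats(), response.status)\n\n\nasync def async_example():\n    # The async variant can keep several tabs alive through its PagePool\n    async with AsyncDynamicSession(headless=True, max_pages=2) as session:\n        response = await session.fetch(\"https://quotes.toscrape.com/\", network_idle=True)\n        print(session.get_pool_stats(), response.status)\n\n\nif __name__ == \"__main__\":\n    sync_example()\n    asyncio.run(async_example())\n"
  },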
  {
    "path": "scrapling/engines/_browsers/_page.py",
    "content": "from threading import RLock\nfrom dataclasses import dataclass\n\nfrom playwright.sync_api._generated import Page as SyncPage\nfrom playwright.async_api._generated import Page as AsyncPage\n\nfrom scrapling.core._types import Optional, List, Literal, overload, TypeVar, Generic, cast\n\nPageState = Literal[\"ready\", \"busy\", \"error\"]  # States that a page can be in\nPageType = TypeVar(\"PageType\", SyncPage, AsyncPage)\n\n\n@dataclass\nclass PageInfo(Generic[PageType]):\n    \"\"\"Information about the page and its current state\"\"\"\n\n    __slots__ = (\"page\", \"state\", \"url\")\n    page: PageType\n    state: PageState\n    url: Optional[str]\n\n    def mark_busy(self, url: str = \"\"):\n        \"\"\"Mark the page as busy\"\"\"\n        self.state = \"busy\"\n        self.url = url\n\n    def mark_error(self):\n        \"\"\"Mark the page as having an error\"\"\"\n        self.state = \"error\"\n\n    def __repr__(self):\n        return f'Page(URL=\"{self.url!r}\", state={self.state!r})'\n\n    def __eq__(self, other_page):\n        \"\"\"Comparing this page to another page object.\"\"\"\n        if other_page.__class__ is not self.__class__:\n            return NotImplemented\n        return self.page == other_page.page\n\n\nclass PagePool:\n    \"\"\"Manages a pool of browser pages/tabs with state tracking\"\"\"\n\n    __slots__ = (\"max_pages\", \"pages\", \"_lock\")\n\n    def __init__(self, max_pages: int = 5):\n        self.max_pages = max_pages\n        self.pages: List[PageInfo[SyncPage] | PageInfo[AsyncPage]] = []\n        self._lock = RLock()\n\n    @overload\n    def add_page(self, page: SyncPage) -> PageInfo[SyncPage]: ...\n\n    @overload\n    def add_page(self, page: AsyncPage) -> PageInfo[AsyncPage]: ...\n\n    def add_page(self, page: SyncPage | AsyncPage) -> PageInfo[SyncPage] | PageInfo[AsyncPage]:\n        \"\"\"Add a new page to the pool\"\"\"\n        with self._lock:\n            if len(self.pages) >= self.max_pages:\n                raise RuntimeError(f\"Maximum page limit ({self.max_pages}) reached\")\n\n            if isinstance(page, AsyncPage):\n                page_info: PageInfo[SyncPage] | PageInfo[AsyncPage] = cast(\n                    PageInfo[AsyncPage], PageInfo(page, \"ready\", \"\")\n                )\n            else:\n                page_info = cast(PageInfo[SyncPage], PageInfo(page, \"ready\", \"\"))\n\n            self.pages.append(page_info)\n            return page_info\n\n    @property\n    def pages_count(self) -> int:\n        \"\"\"Get the total number of pages\"\"\"\n        return len(self.pages)\n\n    @property\n    def busy_count(self) -> int:\n        \"\"\"Get the number of busy pages\"\"\"\n        with self._lock:\n            return sum(1 for p in self.pages if p.state == \"busy\")\n\n    def cleanup_error_pages(self):\n        \"\"\"Remove pages in error state\"\"\"\n        with self._lock:\n            self.pages = [p for p in self.pages if p.state != \"error\"]\n"
  },
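  {
    "path": "examples/sketches/page_pool_usage.py",
    "content": "# Illustrative usage sketch (not part of the library itself): walks through the\n# state tracking in PagePool/PageInfo from scrapling/engines/_browsers/_page.py\n# without launching a real browser, using a mock that passes the isinstance\n# checks in add_page(). In real code the pages come from a Playwright context.\n\nfrom unittest.mock import MagicMock\n\nfrom playwright.sync_api._generated import Page as SyncPage\n\nfrom scrapling.engines._browsers._page import PagePool\n\npool = PagePool(max_pages=2)\n\n# add_page() wraps the page in a PageInfo that starts in the \"ready\" state\npage_info = pool.add_page(MagicMock(spec=SyncPage))\nprint(pool.pages_count, pool.busy_count)  # 1 0\n\npage_info.mark_busy(\"https://example.com\")\nprint(pool.busy_count)  # 1\n\n# Pages that hit an error are dropped by cleanup_error_pages()\npage_info.mark_error()\npool.cleanup_error_pages()\nprint(pool.pages_count)  # 0\n"
  },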
  {
    "path": "scrapling/engines/_browsers/_stealth.py",
    "content": "from random import randint\nfrom re import compile as re_compile\nfrom time import sleep as time_sleep\nfrom asyncio import sleep as asyncio_sleep\n\nfrom playwright.sync_api import Locator, Page, BrowserContext\nfrom playwright.async_api import (\n    Page as async_Page,\n    Locator as AsyncLocator,\n    BrowserContext as AsyncBrowserContext,\n)\nfrom patchright.sync_api import sync_playwright\nfrom patchright.async_api import async_playwright\n\nfrom scrapling.core.utils import log\nfrom scrapling.core._types import Any, Optional, ProxyType, Unpack\nfrom scrapling.engines.toolbelt.proxy_rotation import is_proxy_error\nfrom scrapling.engines.toolbelt.convertor import Response, ResponseFactory\nfrom scrapling.engines._browsers._types import StealthSession, StealthFetchParams\nfrom scrapling.engines._browsers._base import SyncSession, AsyncSession, StealthySessionMixin\nfrom scrapling.engines._browsers._validators import validate_fetch as _validate, StealthConfig\n\n__CF_PATTERN__ = re_compile(r\"^https?://challenges\\.cloudflare\\.com/cdn-cgi/challenge-platform/.*\")\n\n\nclass StealthySession(SyncSession, StealthySessionMixin):\n    \"\"\"A Stealthy Browser session manager with page pooling.\"\"\"\n\n    __slots__ = (\n        \"_config\",\n        \"_context_options\",\n        \"_browser_options\",\n        \"_user_data_dir\",\n        \"_headers_keys\",\n        \"max_pages\",\n        \"page_pool\",\n        \"_max_wait_for_page\",\n        \"playwright\",\n        \"context\",\n    )\n\n    def __init__(self, **kwargs: Unpack[StealthSession]):\n        \"\"\"A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.\n\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.\n        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting\n            rules. Defaults to the system default locale.\n        :param timezone_id: Changes the timezone of the browser. 
Defaults to the system timezone.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.\n        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.\n        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.\n        :param extra_flags: A list of additional browser flags to pass to the browser on launch.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.\n        \"\"\"\n        self.__validate__(**kwargs)\n        super().__init__()\n\n    def start(self) -> None:\n        \"\"\"Create a browser for this instance and context.\"\"\"\n        if not self.playwright:\n            self.playwright = sync_playwright().start()\n\n            try:\n                if self._config.cdp_url:  # pragma: no cover\n                    self.browser = self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)\n                    if not self._config.proxy_rotator:\n                        assert self.browser is not None\n                        self.context = self.browser.new_context(**self._context_options)\n                elif self._config.proxy_rotator:\n                    self.browser = self.playwright.chromium.launch(**self._browser_options)\n                else:\n                    persistent_options = (\n                        self._browser_options | self._context_options | {\"user_data_dir\": self._user_data_dir}\n                    )\n                    self.context = self.playwright.chromium.launch_persistent_context(**persistent_options)\n\n                if self.context:\n                    self.context = self._initialize_context(self._config, self.context)\n\n                self._is_alive = True\n            except Exception:\n                # Clean up playwright if browser setup fails\n                self.playwright.stop()\n                self.playwright = 
None\n                raise\n        else:\n            raise RuntimeError(\"Session has already been started\")\n\n    def _cloudflare_solver(self, page: Page) -> None:  # pragma: no cover\n        \"\"\"Solve the Cloudflare challenge displayed on the given Playwright page\n\n        :param page: The targeted page\n        :return:\n        \"\"\"\n        self._wait_for_networkidle(page, timeout=5000)\n        challenge_type = self._detect_cloudflare(ResponseFactory._get_page_content(page))\n        if not challenge_type:\n            log.error(\"No Cloudflare challenge found.\")\n            return None\n        else:\n            log.info(f'The turnstile version discovered is \"{challenge_type}\"')\n            if challenge_type == \"non-interactive\":\n                while \"<title>Just a moment...</title>\" in (ResponseFactory._get_page_content(page)):\n                    log.info(\"Waiting for Cloudflare wait page to disappear.\")\n                    page.wait_for_timeout(1000)\n                    page.wait_for_load_state()\n                log.info(\"Cloudflare captcha is solved\")\n                return None\n\n            else:\n                box_selector = \"#cf_turnstile div, #cf-turnstile div, .turnstile>div>div\"\n                if challenge_type != \"embedded\":\n                    box_selector = \".main-content p+div>div>div\"\n                    while \"Verifying you are human.\" in ResponseFactory._get_page_content(page):\n                        # Waiting for the verify spinner to disappear, checking every 500ms until it's gone\n                        page.wait_for_timeout(500)\n\n                outer_box: Any = {}\n                iframe = page.frame(url=__CF_PATTERN__)\n                if iframe is not None:\n                    self._wait_for_page_stability(iframe, True, False)\n\n                    if challenge_type != \"embedded\":\n                        while not iframe.frame_element().is_visible():\n                            # Double-checking that the iframe is loaded\n                            page.wait_for_timeout(500)\n\n                    outer_box = iframe.frame_element().bounding_box()\n\n                if not iframe or not outer_box:\n                    if \"<title>Just a moment...</title>\" not in (ResponseFactory._get_page_content(page)):\n                        log.info(\"Cloudflare captcha is solved\")\n                        return None\n\n                    outer_box = page.locator(box_selector).last.bounding_box()\n\n                # Calculate the Captcha coordinates for any viewport\n                captcha_x, captcha_y = outer_box[\"x\"] + randint(26, 28), outer_box[\"y\"] + randint(25, 27)\n\n                # Click the challenge checkbox at the calculated coordinates with a human-like delay\n                page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button=\"left\")\n                self._wait_for_networkidle(page)\n\n                if challenge_type != \"embedded\":\n                    attempts = 0\n                    while \"<title>Just a moment...</title>\" in ResponseFactory._get_page_content(page):\n                        # Wait for the page\n                        if attempts >= 100:\n                            log.info(\"Cloudflare page didn't disappear after 10s, continuing...\")\n                            break\n                        page.wait_for_timeout(100)\n                        attempts += 1\n\n                    # 
page.locator(box_selector).last.wait_for(state=\"detached\")\n                    # page.locator(\".zone-name-title\").wait_for(state=\"hidden\")\n\n                self._wait_for_page_stability(page, True, False)\n\n                if \"<title>Just a moment...</title>\" not in (ResponseFactory._get_page_content(page)):\n                    log.info(\"Cloudflare captcha is solved\")\n                    return None\n                else:\n                    log.info(\"Looks like Cloudflare captcha is still present, solving again\")\n                    return self._cloudflare_solver(page)\n\n    def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:\n        \"\"\"Opens up the browser and does your request based on your chosen options.\n\n        :param url: The target URL.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.\n        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.\n        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param proxy: Static proxy to override rotator and session proxy. 
A new browser context will be created and used with it.\n        :return: A `Response` object.\n        \"\"\"\n        static_proxy = kwargs.pop(\"proxy\", None)\n\n        params = _validate(kwargs, self, StealthConfig)\n        if not self._is_alive:  # pragma: no cover\n            raise RuntimeError(\"Context manager has been closed\")\n\n        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()\n        referer = (\n            \"https://www.google.com/\" if (params.google_search and \"referer\" not in request_headers_keys) else None\n        )\n\n        for attempt in range(self._config.retries):\n            proxy: Optional[ProxyType] = None\n            if self._config.proxy_rotator and static_proxy is None:\n                proxy = self._config.proxy_rotator.get_proxy()\n            else:\n                proxy = static_proxy\n\n            with self._page_generator(\n                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains\n            ) as page_info:\n                final_response = [None]\n                page = page_info.page\n                page.on(\"response\", self._create_response_handler(page_info, final_response))\n\n                try:\n                    first_response = page.goto(url, referer=referer)\n                    self._wait_for_page_stability(page, params.load_dom, params.network_idle)\n\n                    if not first_response:\n                        raise RuntimeError(f\"Failed to get response for {url}\")\n\n                    if params.solve_cloudflare:\n                        self._cloudflare_solver(page)\n                        # Make sure the page is fully loaded after the captcha\n                        self._wait_for_page_stability(page, params.load_dom, params.network_idle)\n\n                    if params.page_action:\n                        try:\n                            _ = params.page_action(page)\n                        except Exception as e:  # pragma: no cover\n                            log.error(f\"Error executing page_action: {e}\")\n\n                    if params.wait_selector:\n                        try:\n                            waiter: Locator = page.locator(params.wait_selector)\n                            waiter.first.wait_for(state=params.wait_selector_state)\n                            self._wait_for_page_stability(page, params.load_dom, params.network_idle)\n                        except Exception as e:  # pragma: no cover\n                            log.error(f\"Error waiting for selector {params.wait_selector}: {e}\")\n\n                    page.wait_for_timeout(params.wait)\n\n                    response = ResponseFactory.from_playwright_response(\n                        page, first_response, final_response[0], params.selector_config, meta={\"proxy\": proxy}\n                    )\n                    return response\n\n                except Exception as e:\n                    page_info.mark_error()\n                    if attempt < self._config.retries - 1:\n                        if is_proxy_error(e):\n                            log.warning(\n                                f\"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s...\"\n                            )\n                        else:\n                            log.warning(\n                                f\"Attempt {attempt + 1} failed: {e}. 
Retrying in {self._config.retry_delay}s...\"\n                            )\n                        time_sleep(self._config.retry_delay)\n                    else:\n                        log.error(f\"Failed after {self._config.retries} attempts: {e}\")\n                        raise\n\n        raise RuntimeError(\"Request failed\")  # pragma: no cover\n\n\nclass AsyncStealthySession(AsyncSession, StealthySessionMixin):\n    \"\"\"An async Stealthy Browser session manager with page pooling.\"\"\"\n\n    __slots__ = (\n        \"_config\",\n        \"_context_options\",\n        \"_browser_options\",\n        \"_user_data_dir\",\n        \"_headers_keys\",\n    )\n\n    def __init__(self, **kwargs: Unpack[StealthSession]):\n        \"\"\"A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.\n\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.\n        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting\n            rules. Defaults to the system default locale.\n        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.\n        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.\n        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. 
Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.\n        :param extra_flags: A list of additional browser flags to pass to the browser on launch.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.\n        \"\"\"\n        self.__validate__(**kwargs)\n        super().__init__(max_pages=self._config.max_pages)\n\n    async def start(self) -> None:\n        \"\"\"Create a browser for this instance and context.\"\"\"\n        if not self.playwright:\n            self.playwright = await async_playwright().start()\n            try:\n                if self._config.cdp_url:\n                    self.browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self._config.cdp_url)\n                    if not self._config.proxy_rotator:\n                        assert self.browser is not None\n                        self.context = await self.browser.new_context(**self._context_options)\n                elif self._config.proxy_rotator:\n                    self.browser = await self.playwright.chromium.launch(**self._browser_options)\n                else:\n                    persistent_options = (\n                        self._browser_options | self._context_options | {\"user_data_dir\": self._user_data_dir}\n                    )\n                    self.context = await self.playwright.chromium.launch_persistent_context(**persistent_options)\n\n                if self.context:\n                    self.context = await self._initialize_context(self._config, self.context)\n\n                self._is_alive = True\n            except Exception:\n                # Clean up playwright if browser setup fails\n                await self.playwright.stop()\n                self.playwright = None\n                raise\n        else:\n            raise RuntimeError(\"Session has already been started\")\n\n    async def _cloudflare_solver(self, page: async_Page) -> None:  # pragma: no cover\n        \"\"\"Solve the Cloudflare challenge displayed on the given Playwright page\n\n        :param page: The targeted page\n        :return:\n        \"\"\"\n        await self._wait_for_networkidle(page, timeout=5000)\n        challenge_type = self._detect_cloudflare(await ResponseFactory._get_async_page_content(page))\n        if not challenge_type:\n            log.error(\"No Cloudflare challenge found.\")\n            return None\n        else:\n            log.info(f'The turnstile version discovered is 
\"{challenge_type}\"')\n            if challenge_type == \"non-interactive\":\n                while \"<title>Just a moment...</title>\" in (await ResponseFactory._get_async_page_content(page)):\n                    log.info(\"Waiting for Cloudflare wait page to disappear.\")\n                    await page.wait_for_timeout(1000)\n                    await page.wait_for_load_state()\n                log.info(\"Cloudflare captcha is solved\")\n                return None\n\n            else:\n                box_selector = \"#cf_turnstile div, #cf-turnstile div, .turnstile>div>div\"\n                if challenge_type != \"embedded\":\n                    box_selector = \".main-content p+div>div>div\"\n                    while \"Verifying you are human.\" in (await ResponseFactory._get_async_page_content(page)):\n                        # Waiting for the verify spinner to disappear, checking every 500ms until it's gone\n                        await page.wait_for_timeout(500)\n\n                outer_box: Any = {}\n                iframe = page.frame(url=__CF_PATTERN__)\n                if iframe is not None:\n                    await self._wait_for_page_stability(iframe, True, False)\n\n                    if challenge_type != \"embedded\":\n                        while not await (await iframe.frame_element()).is_visible():\n                            # Double-checking that the iframe is loaded\n                            await page.wait_for_timeout(500)\n\n                    outer_box = await (await iframe.frame_element()).bounding_box()\n\n                if not iframe or not outer_box:\n                    if \"<title>Just a moment...</title>\" not in (await ResponseFactory._get_async_page_content(page)):\n                        log.info(\"Cloudflare captcha is solved\")\n                        return None\n\n                    outer_box = await page.locator(box_selector).last.bounding_box()\n\n                # Calculate the Captcha coordinates for any viewport\n                captcha_x, captcha_y = outer_box[\"x\"] + randint(26, 28), outer_box[\"y\"] + randint(25, 27)\n\n                # Click the challenge checkbox at the calculated coordinates with a human-like delay\n                await page.mouse.click(captcha_x, captcha_y, delay=randint(100, 200), button=\"left\")\n                await self._wait_for_networkidle(page)\n\n                if challenge_type != \"embedded\":\n                    attempts = 0\n                    while \"<title>Just a moment...</title>\" in (await ResponseFactory._get_async_page_content(page)):\n                        # Wait for the page\n                        if attempts >= 100:\n                            log.info(\"Cloudflare page didn't disappear after 10s, continuing...\")\n                            break\n                        await page.wait_for_timeout(100)\n                        attempts += 1\n\n                    # await page.locator(box_selector).last.wait_for(state=\"detached\")\n                    # await page.locator(\".zone-name-title\").wait_for(state=\"hidden\")\n\n                await self._wait_for_page_stability(page, True, False)\n\n                if \"<title>Just a moment...</title>\" not in (await ResponseFactory._get_async_page_content(page)):\n                    log.info(\"Cloudflare captcha is solved\")\n                    return None\n                else:\n                    log.info(\"Looks like Cloudflare captcha is still present, solving again\")\n                    return await 
self._cloudflare_solver(page)\n\n    async def fetch(self, url: str, **kwargs: Unpack[StealthFetchParams]) -> Response:\n        \"\"\"Opens up the browser and does your request based on your chosen options.\n\n        :param url: The target URL.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object.\n        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.\n        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param solve_cloudflare: Solves all types of Cloudflare's Turnstile/Interstitial challenges before returning the response to you.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param proxy: Static proxy to override rotator and session proxy. 
A new browser context will be created and used with it.\n        :return: A `Response` object.\n        \"\"\"\n        static_proxy = kwargs.pop(\"proxy\", None)\n\n        params = _validate(kwargs, self, StealthConfig)\n\n        if not self._is_alive:  # pragma: no cover\n            raise RuntimeError(\"Context manager has been closed\")\n\n        request_headers_keys = {h.lower() for h in params.extra_headers.keys()} if params.extra_headers else set()\n        referer = (\n            \"https://www.google.com/\" if (params.google_search and \"referer\" not in request_headers_keys) else None\n        )\n\n        for attempt in range(self._config.retries):\n            proxy: Optional[ProxyType] = None\n            if self._config.proxy_rotator and static_proxy is None:\n                proxy = self._config.proxy_rotator.get_proxy()\n            else:\n                proxy = static_proxy\n\n            async with self._page_generator(\n                params.timeout, params.extra_headers, params.disable_resources, proxy, params.blocked_domains\n            ) as page_info:\n                final_response = [None]\n                page = page_info.page\n                page.on(\"response\", self._create_response_handler(page_info, final_response))\n\n                try:\n                    first_response = await page.goto(url, referer=referer)\n                    await self._wait_for_page_stability(page, params.load_dom, params.network_idle)\n\n                    if not first_response:\n                        raise RuntimeError(f\"Failed to get response for {url}\")\n\n                    if params.solve_cloudflare:\n                        await self._cloudflare_solver(page)\n                        # Make sure the page is fully loaded after the captcha\n                        await self._wait_for_page_stability(page, params.load_dom, params.network_idle)\n\n                    if params.page_action:\n                        try:\n                            _ = await params.page_action(page)\n                        except Exception as e:  # pragma: no cover\n                            log.error(f\"Error executing page_action: {e}\")\n\n                    if params.wait_selector:\n                        try:\n                            waiter: AsyncLocator = page.locator(params.wait_selector)\n                            await waiter.first.wait_for(state=params.wait_selector_state)\n                            await self._wait_for_page_stability(page, params.load_dom, params.network_idle)\n                        except Exception as e:  # pragma: no cover\n                            log.error(f\"Error waiting for selector {params.wait_selector}: {e}\")\n\n                    await page.wait_for_timeout(params.wait)\n\n                    response = await ResponseFactory.from_async_playwright_response(\n                        page, first_response, final_response[0], params.selector_config, meta={\"proxy\": proxy}\n                    )\n                    return response\n\n                except Exception as e:\n                    page_info.mark_error()\n                    if attempt < self._config.retries - 1:\n                        if is_proxy_error(e):\n                            log.warning(\n                                f\"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {self._config.retry_delay}s...\"\n                            )\n                        else:\n                            log.warning(\n                                
f\"Attempt {attempt + 1} failed: {e}. Retrying in {self._config.retry_delay}s...\"\n                            )\n                        await asyncio_sleep(self._config.retry_delay)\n                    else:\n                        log.error(f\"Failed after {self._config.retries} attempts: {e}\")\n                        raise\n\n        raise RuntimeError(\"Request failed\")  # pragma: no cover\n"
  },
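  {
    "path": "examples/stealthy_session_sketch.py",
    "content": "# NOTE: Hypothetical usage sketch added by the editor; it is not part of the original repository.\n# It only exercises the API shown in `scrapling/engines/_browsers/_stealth.py` above: the\n# `StealthySession` constructor keywords, `start()`, and `fetch()`. The file path and the target\n# URL are illustrative assumptions, and session shutdown is handled by the base-class /\n# context-manager machinery defined outside that file, so it is not shown here.\nfrom scrapling.engines._browsers._stealth import StealthySession\n\n\ndef main() -> None:\n    # Keyword arguments mirror the documented `StealthSession` options.\n    session = StealthySession(headless=True, solve_cloudflare=True, google_search=True)\n    session.start()  # Launches Playwright and the persistent browser context\n\n    # `fetch()` accepts per-request overrides such as `timeout` (milliseconds) and\n    # `wait_selector`, and returns a `Response` object.\n    response = session.fetch(\n        \"https://example.com/\",  # illustrative URL\n        timeout=60_000,\n        wait_selector=\"body\",\n    )\n    print(response)\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },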
  {
    "path": "scrapling/engines/_browsers/_types.py",
    "content": "from io import BytesIO\n\nfrom curl_cffi.requests import (\n    ProxySpec,\n    CookieTypes,\n    BrowserTypeLiteral,\n)\n\nfrom scrapling.core._types import (\n    Dict,\n    List,\n    Set,\n    Tuple,\n    Mapping,\n    Optional,\n    Callable,\n    Sequence,\n    TypedDict,\n    TypeAlias,\n    SetCookieParam,\n    SelectorWaitStates,\n)\nfrom scrapling.engines.toolbelt.proxy_rotation import ProxyRotator\n\n# Type alias for `impersonate` parameter - accepts a single browser or list of browsers\nImpersonateType: TypeAlias = BrowserTypeLiteral | List[BrowserTypeLiteral] | None\n\n\n# Types for session initialization\nclass RequestsSession(TypedDict, total=False):\n    impersonate: ImpersonateType\n    http3: Optional[bool]\n    stealthy_headers: Optional[bool]\n    proxies: Optional[ProxySpec]\n    proxy: Optional[str]\n    proxy_auth: Optional[Tuple[str, str]]\n    proxy_rotator: Optional[ProxyRotator]\n    timeout: Optional[int | float]\n    headers: Optional[Mapping[str, Optional[str]]]\n    retries: Optional[int]\n    retry_delay: Optional[int]\n    follow_redirects: Optional[bool]\n    max_redirects: Optional[int]\n    verify: Optional[bool]\n    cert: Optional[str | Tuple[str, str]]\n    selector_config: Optional[Dict]\n\n\n# Types for GET request method parameters\nclass GetRequestParams(RequestsSession, total=False):\n    params: Optional[Dict | List | Tuple]\n    cookies: Optional[CookieTypes]\n    auth: Optional[Tuple[str, str]]\n\n\n# Types for POST/PUT/DELETE request method parameters\nclass DataRequestParams(GetRequestParams, total=False):\n    data: Optional[Dict[str, str] | List[Tuple] | str | BytesIO | bytes]\n    json: Optional[Dict | List]\n\n\n# Types for browser session\nclass PlaywrightSession(TypedDict, total=False):\n    max_pages: int\n    headless: bool\n    disable_resources: bool\n    network_idle: bool\n    load_dom: bool\n    wait_selector: Optional[str]\n    wait_selector_state: SelectorWaitStates\n    cookies: Sequence[SetCookieParam] | None\n    google_search: bool\n    wait: int | float\n    timezone_id: str | None\n    page_action: Optional[Callable]\n    proxy: Optional[str | Dict[str, str] | Tuple]\n    proxy_rotator: Optional[ProxyRotator]\n    extra_headers: Optional[Dict[str, str]]\n    timeout: int | float\n    init_script: Optional[str]\n    user_data_dir: str\n    selector_config: Optional[Dict]\n    additional_args: Optional[Dict]\n    locale: Optional[str]\n    real_chrome: bool\n    cdp_url: Optional[str]\n    useragent: Optional[str]\n    extra_flags: Optional[List[str]]\n    blocked_domains: Optional[Set[str]]\n    retries: int\n    retry_delay: int | float\n\n\nclass PlaywrightFetchParams(TypedDict, total=False):\n    load_dom: bool\n    wait: int | float\n    network_idle: bool\n    google_search: bool\n    timeout: int | float\n    disable_resources: bool\n    wait_selector: Optional[str]\n    page_action: Optional[Callable]\n    selector_config: Optional[Dict]\n    extra_headers: Optional[Dict[str, str]]\n    wait_selector_state: SelectorWaitStates\n    blocked_domains: Optional[Set[str]]\n    proxy: Optional[str | Dict[str, str]]\n\n\nclass StealthSession(PlaywrightSession, total=False):\n    allow_webgl: bool\n    hide_canvas: bool\n    block_webrtc: bool\n    solve_cloudflare: bool\n\n\nclass StealthFetchParams(PlaywrightFetchParams, total=False):\n    solve_cloudflare: bool\n"
  },
  {
    "path": "scrapling/engines/_browsers/_validators.py",
    "content": "from pathlib import Path\nfrom typing import Annotated\nfrom functools import lru_cache\nfrom urllib.parse import urlparse\nfrom dataclasses import dataclass, fields\n\nfrom msgspec import Struct, Meta, convert, ValidationError\n\nfrom scrapling.core._types import (\n    Any,\n    Dict,\n    List,\n    Set,\n    Tuple,\n    Optional,\n    Callable,\n    Sequence,\n    overload,\n    SetCookieParam,\n    SelectorWaitStates,\n)\nfrom scrapling.engines.toolbelt.proxy_rotation import ProxyRotator\nfrom scrapling.engines.toolbelt.navigation import construct_proxy_dict\nfrom scrapling.engines._browsers._types import PlaywrightFetchParams, StealthFetchParams\n\n\n# Custom validators for msgspec\n@lru_cache(8)\ndef _is_invalid_file_path(value: str) -> bool | str:  # pragma: no cover\n    \"\"\"Fast file path validation\"\"\"\n    path = Path(value)\n    if not path.exists():\n        return f\"Init script path not found: {value}\"\n    if not path.is_file():\n        return f\"Init script is not a file: {value}\"\n    if not path.is_absolute():\n        return f\"Init script is not an absolute path: {value}\"\n    return False\n\n\n@lru_cache(2)\ndef _is_invalid_cdp_url(cdp_url: str) -> bool | str:\n    \"\"\"Fast CDP URL validation\"\"\"\n    if not cdp_url.startswith((\"ws://\", \"wss://\")):\n        return \"CDP URL must use 'ws://' or 'wss://' scheme\"\n\n    netloc = urlparse(cdp_url).netloc\n    if not netloc:  # pragma: no cover\n        return \"Invalid hostname for the CDP URL\"\n    return False\n\n\n# Type aliases for cleaner annotations\nPagesCount = Annotated[int, Meta(ge=1, le=50)]\nRetriesCount = Annotated[int, Meta(ge=1, le=10)]\nSeconds = Annotated[int, float, Meta(ge=0)]\n\n\nclass PlaywrightConfig(Struct, kw_only=True, frozen=False, weakref=True):\n    \"\"\"Configuration struct for validation\"\"\"\n\n    max_pages: PagesCount = 1\n    headless: bool = True\n    disable_resources: bool = False\n    network_idle: bool = False\n    load_dom: bool = True\n    wait_selector: Optional[str] = None\n    wait_selector_state: SelectorWaitStates = \"attached\"\n    cookies: Sequence[SetCookieParam] | None = []\n    google_search: bool = True\n    wait: Seconds = 0\n    timezone_id: str | None = \"\"\n    page_action: Optional[Callable] = None\n    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`\n    proxy_rotator: Optional[ProxyRotator] = None\n    extra_headers: Optional[Dict[str, str]] = None\n    timeout: Seconds = 30000\n    init_script: Optional[str] = None\n    user_data_dir: str = \"\"\n    selector_config: Optional[Dict] = {}\n    additional_args: Optional[Dict] = {}\n    locale: str | None = None\n    real_chrome: bool = False\n    cdp_url: Optional[str] = None\n    useragent: Optional[str] = None\n    extra_flags: Optional[List[str]] = None\n    blocked_domains: Optional[Set[str]] = None\n    retries: RetriesCount = 3\n    retry_delay: Seconds = 1\n\n    def __post_init__(self):  # pragma: no cover\n        \"\"\"Custom validation after msgspec validation\"\"\"\n        if self.page_action and not callable(self.page_action):\n            raise TypeError(f\"page_action must be callable, got {type(self.page_action).__name__}\")\n        if self.proxy and self.proxy_rotator:\n            raise ValueError(\n                \"Cannot use 'proxy_rotator' together with 'proxy'. 
\"\n                \"Use either a static proxy or proxy rotation, not both.\"\n            )\n        if self.proxy:\n            self.proxy = construct_proxy_dict(self.proxy)\n        if self.cdp_url:\n            cdp_msg = _is_invalid_cdp_url(self.cdp_url)\n            if cdp_msg:\n                raise ValueError(cdp_msg)\n\n        if not self.cookies:\n            self.cookies = []\n        if not self.extra_flags:\n            self.extra_flags = []\n        if not self.selector_config:\n            self.selector_config = {}\n        if not self.additional_args:\n            self.additional_args = {}\n\n        if self.init_script is not None:\n            validation_msg = _is_invalid_file_path(self.init_script)\n            if validation_msg:\n                raise ValueError(validation_msg)\n\n\nclass StealthConfig(PlaywrightConfig, kw_only=True, frozen=False, weakref=True):\n    allow_webgl: bool = True\n    hide_canvas: bool = False\n    block_webrtc: bool = False\n    solve_cloudflare: bool = False\n\n    def __post_init__(self):\n        \"\"\"Custom validation after msgspec validation\"\"\"\n        super(StealthConfig, self).__post_init__()\n        # Cloudflare timeout adjustment\n        if self.solve_cloudflare and self.timeout < 60_000:\n            self.timeout = 60_000\n\n\n@dataclass\nclass _fetch_params:\n    \"\"\"A dataclass of all parameters used by `fetch` calls\"\"\"\n\n    google_search: bool\n    timeout: Seconds\n    wait: Seconds\n    page_action: Optional[Callable]\n    extra_headers: Optional[Dict[str, str]]\n    disable_resources: bool\n    wait_selector: Optional[str]\n    wait_selector_state: SelectorWaitStates\n    network_idle: bool\n    load_dom: bool\n    blocked_domains: Optional[Set[str]]\n    solve_cloudflare: bool\n    selector_config: Dict\n\n\ndef validate_fetch(\n    method_kwargs: Dict | PlaywrightFetchParams | StealthFetchParams,\n    session: Any,\n    model: type[PlaywrightConfig] | type[StealthConfig],\n) -> _fetch_params:  # pragma: no cover\n    result: Dict[str, Any] = {}\n    overrides: Dict[str, Any] = {}\n    kwargs_dict: Dict[str, Any] = dict(method_kwargs)\n\n    # Get all field names that _fetch_params needs\n    fetch_param_fields = {f.name for f in fields(_fetch_params)}\n\n    for key in fetch_param_fields:\n        if key in kwargs_dict:\n            overrides[key] = kwargs_dict[key]\n        elif hasattr(session, \"_config\") and hasattr(session._config, key):\n            result[key] = getattr(session._config, key)\n\n    if overrides:\n        validated_config = validate(overrides, model)\n        # Extract ONLY the fields that were actually overridden (not all fields)\n        # This prevents validated defaults from overwriting session config values\n        validated_dict = {\n            field: getattr(validated_config, field) for field in overrides.keys() if hasattr(validated_config, field)\n        }\n\n        # Preserve solve_cloudflare if the user explicitly provided it, even if the model doesn't have it\n        if \"solve_cloudflare\" in overrides:\n            validated_dict[\"solve_cloudflare\"] = overrides[\"solve_cloudflare\"]\n\n        # Start with session defaults, then overwrite with validated overrides\n        result.update(validated_dict)\n\n    # solve_cloudflare defaults to False for models that don't have it (PlaywrightConfig)\n    result.setdefault(\"solve_cloudflare\", False)\n    result.setdefault(\"blocked_domains\", None)\n\n    return _fetch_params(**result)\n\n\n# Cache default values for 
each model to reduce validation overhead\nmodels_default_values = {}\n\nfor _model in (StealthConfig, PlaywrightConfig):\n    _defaults = {}\n    if hasattr(_model, \"__struct_defaults__\") and hasattr(_model, \"__struct_fields__\"):\n        for field_name, default_value in zip(_model.__struct_fields__, _model.__struct_defaults__):  # type: ignore\n            # Skip factory defaults - these are msgspec._core.Factory instances\n            if type(default_value).__name__ != \"Factory\":\n                _defaults[field_name] = default_value\n\n    models_default_values[_model.__name__] = _defaults.copy()\n\n\ndef _filter_defaults(params: Dict, model: str) -> Dict:\n    \"\"\"Filter out parameters that match their default values to reduce validation overhead.\"\"\"\n    defaults = models_default_values[model]\n    return {k: v for k, v in params.items() if k not in defaults or v != defaults[k]}\n\n\n@overload\ndef validate(params: Dict, model: type[StealthConfig]) -> StealthConfig: ...\n\n\n@overload\ndef validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...\n\n\ndef validate(params: Dict, model: type[PlaywrightConfig] | type[StealthConfig]) -> PlaywrightConfig | StealthConfig:\n    try:\n        # Filter out params with the default values (no need to validate them) to speed up validation\n        filtered = _filter_defaults(params, model.__name__)\n        return convert(filtered, model)\n    except ValidationError as e:\n        raise TypeError(f\"Invalid argument type: {e}\") from e\n"
  },
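  {
    "path": "examples/validators_sketch.py",
    "content": "# NOTE: Hypothetical sketch added by the editor; it is not part of the original repository.\n# It demonstrates the `validate()` helper defined in `_validators.py` above: keyword arguments\n# are converted into a msgspec config struct, values equal to the model defaults are filtered\n# out before validation, and invalid types surface as `TypeError`.\nfrom scrapling.engines._browsers._validators import StealthConfig, validate\n\n# Valid overrides are converted into a `StealthConfig` instance.\nconfig = validate({\"solve_cloudflare\": True, \"headless\": False}, StealthConfig)\n\n# `StealthConfig.__post_init__` bumps the timeout to at least 60s when Cloudflare solving is enabled.\nassert config.timeout == 60_000\n\n# Wrong types are rejected with a `TypeError` raised from msgspec's `ValidationError`.\ntry:\n    validate({\"headless\": \"yes\"}, StealthConfig)\nexcept TypeError as exc:\n    print(f\"Rejected as expected: {exc}\")\n"
  },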
  {
    "path": "scrapling/engines/constants.py",
    "content": "# Disable loading these resources for speed\nEXTRA_RESOURCES = {\n    \"font\",\n    \"image\",\n    \"media\",\n    \"beacon\",\n    \"object\",\n    \"imageset\",\n    \"texttrack\",\n    \"websocket\",\n    \"csp_report\",\n    \"stylesheet\",\n}\n\nHARMFUL_ARGS = (\n    # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884\n    \"--enable-automation\",\n    \"--disable-popup-blocking\",\n    \"--disable-component-update\",\n    \"--disable-default-apps\",\n    \"--disable-extensions\",\n)\n\nDEFAULT_ARGS = (\n    # Speed up chromium browsers by default\n    \"--no-pings\",\n    \"--no-first-run\",\n    \"--disable-infobars\",\n    \"--disable-breakpad\",\n    \"--no-service-autorun\",\n    \"--homepage=about:blank\",\n    \"--password-store=basic\",\n    \"--disable-hang-monitor\",\n    \"--no-default-browser-check\",\n    \"--disable-session-crashed-bubble\",\n    \"--disable-search-engine-choice-screen\",\n)\n\nSTEALTH_ARGS = (\n    # Explanation: https://peter.sh/experiments/chromium-command-line-switches/\n    # Generally this will make the browser faster and less detectable\n    # \"--incognito\",\n    \"--test-type\",\n    \"--lang=en-US\",\n    \"--mute-audio\",\n    \"--disable-sync\",\n    \"--hide-scrollbars\",\n    \"--disable-logging\",\n    \"--start-maximized\",  # For headless check bypass\n    \"--enable-async-dns\",\n    \"--accept-lang=en-US\",\n    \"--use-mock-keychain\",\n    \"--disable-translate\",\n    \"--disable-voice-input\",\n    \"--window-position=0,0\",\n    \"--disable-wake-on-wifi\",\n    \"--ignore-gpu-blocklist\",\n    \"--enable-tcp-fast-open\",\n    \"--enable-web-bluetooth\",\n    \"--disable-cloud-import\",\n    \"--disable-print-preview\",\n    \"--disable-dev-shm-usage\",\n    # '--disable-popup-blocking',\n    \"--metrics-recording-only\",\n    \"--disable-crash-reporter\",\n    \"--disable-partial-raster\",\n    \"--disable-gesture-typing\",\n    \"--disable-checker-imaging\",\n    \"--disable-prompt-on-repost\",\n    \"--force-color-profile=srgb\",\n    \"--font-render-hinting=none\",\n    \"--aggressive-cache-discard\",\n    \"--disable-cookie-encryption\",\n    \"--disable-domain-reliability\",\n    \"--disable-threaded-animation\",\n    \"--disable-threaded-scrolling\",\n    \"--enable-simple-cache-backend\",\n    \"--disable-background-networking\",\n    \"--enable-surface-synchronization\",\n    \"--disable-image-animation-resync\",\n    \"--disable-renderer-backgrounding\",\n    \"--disable-ipc-flooding-protection\",\n    \"--prerender-from-omnibox=disabled\",\n    \"--safebrowsing-disable-auto-update\",\n    \"--disable-offer-upload-credit-cards\",\n    \"--disable-background-timer-throttling\",\n    \"--disable-new-content-rendering-timeout\",\n    \"--run-all-compositor-stages-before-draw\",\n    \"--disable-client-side-phishing-detection\",\n    \"--disable-backgrounding-occluded-windows\",\n    \"--disable-layer-tree-host-memory-pressure\",\n    \"--autoplay-policy=user-gesture-required\",\n    \"--disable-offer-store-unmasked-wallet-cards\",\n    \"--disable-blink-features=AutomationControlled\",\n    \"--disable-component-extensions-with-background-pages\",\n    \"--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance\",\n    \"--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4\",\n    
\"--disable-features=AudioServiceOutOfProcess,TranslateUI,BlinkGenPropertyTrees\",\n)\n"
  },
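  {
    "path": "examples/browser_flags_sketch.py",
    "content": "# NOTE: Hypothetical sketch added by the editor; it is not part of the original repository and\n# not how Scrapling itself assembles its launch flags. It only illustrates the intent documented\n# in `constants.py` above: DEFAULT_ARGS and STEALTH_ARGS are meant to be passed to Chromium on\n# launch, while HARMFUL_ARGS lists switches that should stay out of the final command line.\nfrom scrapling.engines.constants import DEFAULT_ARGS, HARMFUL_ARGS, STEALTH_ARGS\n\n\ndef build_launch_flags(extra_flags: tuple[str, ...] = ()) -> list[str]:\n    \"\"\"Combine the flag tuples and drop anything listed as harmful.\"\"\"\n    combined = (*DEFAULT_ARGS, *STEALTH_ARGS, *extra_flags)\n    return [flag for flag in combined if flag not in HARMFUL_ARGS]\n\n\nif __name__ == \"__main__\":\n    flags = build_launch_flags((\"--window-size=1280,720\",))\n    print(f\"{len(flags)} flags, e.g. {flags[:3]}\")\n"
  },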
  {
    "path": "scrapling/engines/static.py",
    "content": "from abc import ABC\nfrom random import choice\nfrom time import sleep as time_sleep\nfrom asyncio import sleep as asyncio_sleep\n\nfrom curl_cffi.curl import CurlError\nfrom curl_cffi import CurlHttpVersion\nfrom curl_cffi.requests import (\n    BrowserTypeLiteral,\n    Session as CurlSession,\n    AsyncSession as AsyncCurlSession,\n)\n\nfrom scrapling.core.utils import log\nfrom scrapling.core._types import (\n    Any,\n    Dict,\n    Tuple,\n    Unpack,\n    Optional,\n    Awaitable,\n    SUPPORTED_HTTP_METHODS,\n)\n\nfrom .toolbelt.custom import Response\nfrom .toolbelt.convertor import ResponseFactory\nfrom .toolbelt.proxy_rotation import ProxyRotator, is_proxy_error\nfrom ._browsers._types import RequestsSession, GetRequestParams, DataRequestParams, ImpersonateType\nfrom .toolbelt.fingerprints import generate_headers, __default_useragent__\n\n_NO_SESSION: Any = object()\n\n\ndef _select_random_browser(impersonate: ImpersonateType) -> Optional[BrowserTypeLiteral]:\n    \"\"\"\n    Handle browser selection logic for the ` impersonate ` parameter.\n\n    If impersonate is a list, randomly select one browser from it.\n    If it's a string or None, return as is.\n    \"\"\"\n    if isinstance(impersonate, list):\n        if not impersonate:\n            return None\n        return choice(impersonate)\n    return impersonate\n\n\nclass _ConfigurationLogic(ABC):\n    # Core Logic Handler (Internal Engine)\n    __slots__ = (\n        \"_default_impersonate\",\n        \"_stealth\",\n        \"_default_proxies\",\n        \"_default_proxy\",\n        \"_default_proxy_auth\",\n        \"_default_timeout\",\n        \"_default_headers\",\n        \"_default_retries\",\n        \"_default_retry_delay\",\n        \"_default_follow_redirects\",\n        \"_default_max_redirects\",\n        \"_default_verify\",\n        \"_default_cert\",\n        \"_default_http3\",\n        \"selector_config\",\n        \"_is_alive\",\n        \"_proxy_rotator\",\n    )\n\n    def __init__(self, **kwargs: Unpack[RequestsSession]):\n        self._default_impersonate = kwargs.get(\"impersonate\", \"chrome\")\n        self._stealth = kwargs.get(\"stealthy_headers\", True)\n        self._default_proxies = kwargs.get(\"proxies\") or {}\n        self._default_proxy = kwargs.get(\"proxy\") or None\n        self._default_proxy_auth = kwargs.get(\"proxy_auth\") or None\n        self._default_timeout = kwargs.get(\"timeout\", 30)\n        self._default_headers = kwargs.get(\"headers\") or {}\n        self._default_retries = kwargs.get(\"retries\", 3)\n        self._default_retry_delay = kwargs.get(\"retry_delay\", 1)\n        self._default_follow_redirects = kwargs.get(\"follow_redirects\", True)\n        self._default_max_redirects = kwargs.get(\"max_redirects\", 30)\n        self._default_verify = kwargs.get(\"verify\", True)\n        self._default_cert = kwargs.get(\"cert\") or None\n        self._default_http3 = kwargs.get(\"http3\", False)\n        self.selector_config = kwargs.get(\"selector_config\") or {}\n        self._is_alive = False\n        self._proxy_rotator: Optional[ProxyRotator] = kwargs.get(\"proxy_rotator\")\n\n        if self._proxy_rotator and (self._default_proxy or self._default_proxies):\n            raise ValueError(\n                \"Cannot use 'proxy_rotator' together with 'proxy' or 'proxies'. 
\"\n                \"Use either a static proxy or proxy rotation, not both.\"\n            )\n\n    @staticmethod\n    def _get_param(kwargs: Dict, key: str, default: Any) -> Any:\n        \"\"\"Get parameter from kwargs if present, otherwise return default.\"\"\"\n        return kwargs[key] if key in kwargs else default\n\n    def _merge_request_args(self, **method_kwargs) -> Dict[str, Any]:\n        \"\"\"Merge request-specific arguments with default session arguments.\"\"\"\n        url = method_kwargs.pop(\"url\")\n\n        # Get parameters from kwargs or use defaults\n        impersonate = self._get_param(method_kwargs, \"impersonate\", self._default_impersonate)\n        impersonate = _select_random_browser(impersonate)\n        http3_enabled = self._get_param(method_kwargs, \"http3\", self._default_http3)\n        stealth = self._get_param(method_kwargs, \"stealth\", self._stealth)\n\n        final_args = {\n            \"url\": url,\n            # Curl automatically generates the suitable browser headers when you use `impersonate`\n            \"headers\": self._headers_job(\n                url,\n                self._get_param(method_kwargs, \"headers\", self._default_headers),\n                stealth,\n                bool(impersonate),\n            ),\n            \"proxies\": self._get_param(method_kwargs, \"proxies\", self._default_proxies),\n            \"proxy\": self._get_param(method_kwargs, \"proxy\", self._default_proxy),\n            \"proxy_auth\": self._get_param(method_kwargs, \"proxy_auth\", self._default_proxy_auth),\n            \"timeout\": self._get_param(method_kwargs, \"timeout\", self._default_timeout),\n            \"allow_redirects\": self._get_param(method_kwargs, \"follow_redirects\", self._default_follow_redirects),\n            \"max_redirects\": self._get_param(method_kwargs, \"max_redirects\", self._default_max_redirects),\n            \"verify\": self._get_param(method_kwargs, \"verify\", self._default_verify),\n            \"cert\": self._get_param(method_kwargs, \"cert\", self._default_cert),\n            \"impersonate\": impersonate,\n        }\n\n        # Add any remaining parameters that weren't explicitly handled above\n        # Skip the ones we already processed plus internal params\n        skip_keys = {\n            \"impersonate\",\n            \"http3\",\n            \"stealth\",\n            \"headers\",\n            \"proxies\",\n            \"proxy\",\n            \"proxy_auth\",\n            \"timeout\",\n            \"follow_redirects\",\n            \"max_redirects\",\n            \"verify\",\n            \"cert\",\n            \"retries\",\n            \"retry_delay\",\n            \"selector_config\",\n            # Browser session params (ignored by HTTP sessions)\n            \"extra_headers\",\n            \"google_search\",\n        }\n        for k, v in method_kwargs.items():\n            if k not in skip_keys and v is not None:\n                final_args[k] = v\n\n        if http3_enabled:  # pragma: no cover\n            final_args[\"http_version\"] = CurlHttpVersion.V3ONLY\n            if impersonate:\n                log.warning(\n                    \"The argument `http3` might cause errors if used with `impersonate` argument, try switching it off if you encounter any curl errors.\"\n                )\n\n        return final_args\n\n    def _headers_job(self, url, headers: Dict, stealth: bool, impersonate_enabled: bool) -> Dict:\n        \"\"\"\n        1. 
Adds a useragent to the headers if it doesn't have one\n        2. Generates real headers and append them to current headers\n        3. Sets a Google referer header.\n        \"\"\"\n        # Merge session headers with request headers, request takes precedence (if it was set)\n        final_headers = {**self._default_headers, **(headers if headers else {})}\n        headers_keys = {k.lower() for k in final_headers}\n        if stealth:\n            if \"referer\" not in headers_keys:\n                final_headers[\"referer\"] = \"https://www.google.com/\"\n\n            if not impersonate_enabled:  # Curl will generate the suitable headers\n                extra_headers = generate_headers(browser_mode=False)\n                final_headers.update(\n                    {k: v for k, v in extra_headers.items() if k.lower() not in headers_keys}\n                )  # Don't overwrite user-supplied headers\n\n        elif \"user-agent\" not in headers_keys and not impersonate_enabled:  # pragma: no cover\n            final_headers[\"User-Agent\"] = __default_useragent__\n            log.debug(f\"Can't find useragent in headers so '{final_headers['User-Agent']}' was used.\")\n\n        return final_headers\n\n\nclass _SyncSessionLogic(_ConfigurationLogic):\n    __slots__ = (\"_curl_session\",)\n\n    def __init__(self, **kwargs: Unpack[RequestsSession]):\n        super().__init__(**kwargs)\n        self._curl_session: Optional[CurlSession] = None\n\n    def __enter__(self):\n        \"\"\"Creates and returns a new synchronous Fetcher Session\"\"\"\n        if self._is_alive:\n            raise RuntimeError(\"This FetcherSession instance already has an active synchronous session.\")\n\n        self._curl_session = CurlSession()\n        self._is_alive = True\n        return self\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        \"\"\"Closes the active synchronous session managed by this instance, if any.\"\"\"\n        # For type checking (not accessed error)\n        _ = (\n            exc_type,\n            exc_val,\n            exc_tb,\n        )\n        if self._curl_session:\n            self._curl_session.close()\n            self._curl_session = None\n\n        self._is_alive = False\n\n    def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:\n        \"\"\"\n        Perform an HTTP request using the configured session.\n        \"\"\"\n        stealth = self._stealth if stealth is None else stealth\n\n        selector_config = self._get_param(kwargs, \"selector_config\", self.selector_config) or self.selector_config\n        max_retries = self._get_param(kwargs, \"retries\", self._default_retries)\n        retry_delay = self._get_param(kwargs, \"retry_delay\", self._default_retry_delay)\n        static_proxy = kwargs.pop(\"proxy\", None)\n\n        session = self._curl_session\n        one_off_request = False\n        if session is _NO_SESSION and self.__enter__ is None:\n            # For usage inside FetcherClient\n            # It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.\n            session = CurlSession()\n            one_off_request = True\n\n        if not session:\n            raise RuntimeError(\"No active session available.\")  # pragma: no cover\n\n        try:\n            for attempt in range(max_retries):\n                if self._proxy_rotator and static_proxy is None:\n                    proxy = 
self._proxy_rotator.get_proxy()\n                else:\n                    proxy = static_proxy\n\n                request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)\n                try:\n                    response = session.request(method, **request_args)\n                    result = ResponseFactory.from_http_request(response, selector_config, meta={\"proxy\": proxy})\n                    return result\n                except CurlError as e:  # pragma: no cover\n                    if attempt < max_retries - 1:\n                        # Now if the rotator is enabled, we will try again with the new proxy\n                        # If it's not enabled, then we will try again with the same proxy\n                        if is_proxy_error(e):\n                            log.warning(\n                                f\"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds...\"\n                            )\n                        else:\n                            log.warning(f\"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...\")\n                        time_sleep(retry_delay)\n                    else:\n                        log.error(f\"Failed after {max_retries} attempts: {e}\")\n                        raise  # Raise the exception if all retries fail\n        finally:\n            if session and one_off_request:\n                session.close()\n\n        raise RuntimeError(\"No active session available.\")  # pragma: no cover\n\n    def get(self, url: str, **kwargs: Unpack[GetRequestParams]) -> Response:\n        \"\"\"\n        Perform a GET request.\n\n        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.\n\n        :param url: Target URL for the request.\n        :param kwargs: Additional keyword arguments including:\n            - params: Query string parameters for the request.\n            - headers: Headers to include in the request.\n            - cookies: Cookies to use in the request.\n            - timeout: Number of seconds to wait before timing out.\n            - follow_redirects: Whether to follow redirects. Defaults to True.\n            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.\n            - retries: Number of retry attempts. Defaults to 3.\n            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.\n            - proxies: Dict of proxies to use.\n            - proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).\n            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.\n            - verify: Whether to verify HTTPS certificates.\n            - cert: Tuple of (cert, key) filenames for the client certificate.\n            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.\n            - http3: Whether to use HTTP3. Defaults to False. 
It might be problematic if used it with `impersonate`.\n            - stealthy_headers: If enabled (default), it creates and adds real browser headers.\n        :return: A `Response` object.\n        \"\"\"\n        stealthy_headers = kwargs.pop(\"stealthy_headers\", None)\n        return self._make_request(\"GET\", stealth=stealthy_headers, url=url, **kwargs)\n\n    def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:\n        \"\"\"\n        Perform a POST request.\n\n        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.\n\n        :param url: Target URL for the request.\n        :param kwargs: Additional keyword arguments including:\n            - data: Form data to include in the request body.\n            - json: A JSON serializable object to include in the body of the request.\n            - params: Query string parameters for the request.\n            - headers: Headers to include in the request.\n            - cookies: Cookies to use in the request.\n            - timeout: Number of seconds to wait before timing out.\n            - follow_redirects: Whether to follow redirects. Defaults to True.\n            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.\n            - retries: Number of retry attempts. Defaults to 3.\n            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.\n            - proxies: Dict of proxies to use.\n            - proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).\n            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.\n            - verify: Whether to verify HTTPS certificates.\n            - cert: Tuple of (cert, key) filenames for the client certificate.\n            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.\n            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.\n            - stealthy_headers: If enabled (default), it creates and adds real browser headers.\n        :return: A `Response` object.\n        \"\"\"\n        stealthy_headers = kwargs.pop(\"stealthy_headers\", None)\n        return self._make_request(\"POST\", stealth=stealthy_headers, url=url, **kwargs)\n\n    def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:\n        \"\"\"\n        Perform a PUT request.\n\n        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.\n\n        :param url: Target URL for the request.\n        :param kwargs: Additional keyword arguments including:\n            - data: Form data to include in the request body.\n            - json: A JSON serializable object to include in the body of the request.\n            - params: Query string parameters for the request.\n            - headers: Headers to include in the request.\n            - cookies: Cookies to use in the request.\n            - timeout: Number of seconds to wait before timing out.\n            - follow_redirects: Whether to follow redirects. Defaults to True.\n            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.\n            - retries: Number of retry attempts. Defaults to 3.\n            - retry_delay: Number of seconds to wait between retry attempts. 
Defaults to 1 second.\n            - proxies: Dict of proxies to use.\n            - proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).\n            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.\n            - verify: Whether to verify HTTPS certificates.\n            - cert: Tuple of (cert, key) filenames for the client certificate.\n            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.\n            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.\n            - stealthy_headers: If enabled (default), it creates and adds real browser headers.\n        :return: A `Response` object.\n        \"\"\"\n        stealthy_headers = kwargs.pop(\"stealthy_headers\", None)\n        return self._make_request(\"PUT\", stealth=stealthy_headers, url=url, **kwargs)\n\n    def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Response:\n        \"\"\"\n        Perform a DELETE request.\n\n        Any additional keyword arguments are passed to the `curl_cffi.requests.Session().request()` method.\n\n        :param url: Target URL for the request.\n        :param kwargs: Additional keyword arguments including:\n            - data: Form data to include in the request body.\n            - json: A JSON serializable object to include in the body of the request.\n            - params: Query string parameters for the request.\n            - headers: Headers to include in the request.\n            - cookies: Cookies to use in the request.\n            - timeout: Number of seconds to wait before timing out.\n            - follow_redirects: Whether to follow redirects. Defaults to True.\n            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.\n            - retries: Number of retry attempts. Defaults to 3.\n            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.\n            - proxies: Dict of proxies to use.\n            - proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).\n            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.\n            - verify: Whether to verify HTTPS certificates.\n            - cert: Tuple of (cert, key) filenames for the client certificate.\n            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.\n            - http3: Whether to use HTTP3. Defaults to False. 
It might be problematic if used it with `impersonate`.\n            - stealthy_headers: If enabled (default), it creates and adds real browser headers.\n        :return: A `Response` object.\n        \"\"\"\n        # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,\n        # But some websites accept it, it depends on the implementation used.\n        stealthy_headers = kwargs.pop(\"stealthy_headers\", None)\n        return self._make_request(\"DELETE\", stealth=stealthy_headers, url=url, **kwargs)\n\n\nclass _ASyncSessionLogic(_ConfigurationLogic):\n    __slots__ = (\"_async_curl_session\",)\n\n    def __init__(self, **kwargs: Unpack[RequestsSession]):\n        super().__init__(**kwargs)\n        self._async_curl_session: Optional[AsyncCurlSession] = None\n\n    async def __aenter__(self):  # pragma: no cover\n        \"\"\"Creates and returns a new asynchronous Session.\"\"\"\n        if self._is_alive:\n            raise RuntimeError(\"This FetcherSession instance already has an active asynchronous session.\")\n\n        self._async_curl_session = AsyncCurlSession()\n        self._is_alive = True\n        return self\n\n    async def __aexit__(self, exc_type, exc_val, exc_tb):\n        \"\"\"Closes the active asynchronous session managed by this instance, if any.\"\"\"\n        # For type checking (not accessed error)\n        _ = (\n            exc_type,\n            exc_val,\n            exc_tb,\n        )\n        if self._async_curl_session:\n            await self._async_curl_session.close()\n            self._async_curl_session = None\n\n        self._is_alive = False\n\n    async def _make_request(self, method: SUPPORTED_HTTP_METHODS, stealth: Optional[bool] = None, **kwargs) -> Response:\n        \"\"\"\n        Perform an HTTP request using the configured session.\n        \"\"\"\n        stealth = self._stealth if stealth is None else stealth\n\n        selector_config = self._get_param(kwargs, \"selector_config\", self.selector_config) or self.selector_config\n        max_retries = self._get_param(kwargs, \"retries\", self._default_retries)\n        retry_delay = self._get_param(kwargs, \"retry_delay\", self._default_retry_delay)\n        static_proxy = kwargs.pop(\"proxy\", None)\n\n        session = self._async_curl_session\n        one_off_request = False\n        if session is _NO_SESSION and self.__aenter__ is None:\n            # For usage inside the ` AsyncFetcherClient ` class, and that's for several reasons\n            # 1. It turns out `curl_cffi` caches impersonation state, so if you turned it off, then on then off, it won't be off on the last time.\n            # 2. `curl_cffi` doesn't support making async requests without sessions\n            # 3. 
Using a single session for many requests at the same time in async doesn't sit well with curl_cffi.\n            session = AsyncCurlSession()\n            one_off_request = True\n\n        if not session:\n            raise RuntimeError(\"No active session available.\")  # pragma: no cover\n\n        try:\n            # Determine if we should use proxy rotation\n            for attempt in range(max_retries):\n                if self._proxy_rotator and static_proxy is None:\n                    proxy = self._proxy_rotator.get_proxy()\n                else:\n                    proxy = static_proxy\n\n                request_args = self._merge_request_args(stealth=stealth, proxy=proxy, **kwargs)\n                try:\n                    response = await session.request(method, **request_args)\n                    result = ResponseFactory.from_http_request(response, selector_config, meta={\"proxy\": proxy})\n                    return result\n                except CurlError as e:  # pragma: no cover\n                    if attempt < max_retries - 1:\n                        # Now if the rotator is enabled, we will try again with the new proxy\n                        # If it's not enabled, then we will try again with the same proxy\n                        if is_proxy_error(e):\n                            log.warning(\n                                f\"Proxy '{proxy}' failed (attempt {attempt + 1}) | Retrying in {retry_delay} seconds...\"\n                            )\n                        else:\n                            log.warning(f\"Attempt {attempt + 1} failed: {e}. Retrying in {retry_delay} seconds...\")\n\n                        await asyncio_sleep(retry_delay)\n                    else:\n                        log.error(f\"Failed after {max_retries} attempts: {e}\")\n                        raise  # Raise the exception if all retries fail\n        finally:\n            if session and one_off_request:\n                await session.close()\n\n        raise RuntimeError(\"No active session available.\")  # pragma: no cover\n\n    def get(self, url: str, **kwargs: Unpack[GetRequestParams]) -> Awaitable[Response]:\n        \"\"\"\n        Perform a GET request.\n\n        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.\n\n        :param url: Target URL for the request.\n        :param kwargs: Additional keyword arguments including:\n            - params: Query string parameters for the request.\n            - headers: Headers to include in the request.\n            - cookies: Cookies to use in the request.\n            - timeout: Number of seconds to wait before timing out.\n            - follow_redirects: Whether to follow redirects. Defaults to True.\n            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.\n            - retries: Number of retry attempts. Defaults to 3.\n            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.\n            - proxies: Dict of proxies to use.\n            - proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).\n            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.\n            - verify: Whether to verify HTTPS certificates.\n            - cert: Tuple of (cert, key) filenames for the client certificate.\n            - impersonate: Browser version to impersonate. 
Automatically defaults to the latest available Chrome version.\n            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.\n            - stealthy_headers: If enabled (default), it creates and adds real browser headers.\n        :return: A `Response` object.\n        \"\"\"\n        stealthy_headers = kwargs.pop(\"stealthy_headers\", None)\n        return self._make_request(\"GET\", stealth=stealthy_headers, url=url, **kwargs)\n\n    def post(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:\n        \"\"\"\n        Perform a POST request.\n\n        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.\n\n        :param url: Target URL for the request.\n        :param kwargs: Additional keyword arguments including:\n            - data: Form data to include in the request body.\n            - json: A JSON serializable object to include in the body of the request.\n            - params: Query string parameters for the request.\n            - headers: Headers to include in the request.\n            - cookies: Cookies to use in the request.\n            - timeout: Number of seconds to wait before timing out.\n            - follow_redirects: Whether to follow redirects. Defaults to True.\n            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.\n            - retries: Number of retry attempts. Defaults to 3.\n            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.\n            - proxies: Dict of proxies to use.\n            - proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).\n            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.\n            - verify: Whether to verify HTTPS certificates.\n            - cert: Tuple of (cert, key) filenames for the client certificate.\n            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.\n            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.\n            - stealthy_headers: If enabled (default), it creates and adds real browser headers.\n        :return: A `Response` object.\n        \"\"\"\n        stealthy_headers = kwargs.pop(\"stealthy_headers\", None)\n        return self._make_request(\"POST\", stealth=stealthy_headers, url=url, **kwargs)\n\n    def put(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:\n        \"\"\"\n        Perform a PUT request.\n\n        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.\n\n        :param url: Target URL for the request.\n        :param kwargs: Additional keyword arguments including:\n            - data: Form data to include in the request body.\n            - json: A JSON serializable object to include in the body of the request.\n            - params: Query string parameters for the request.\n            - headers: Headers to include in the request.\n            - cookies: Cookies to use in the request.\n            - timeout: Number of seconds to wait before timing out.\n            - follow_redirects: Whether to follow redirects. Defaults to True.\n            - max_redirects: Maximum number of redirects. 
Default 30, use -1 for unlimited.\n            - retries: Number of retry attempts. Defaults to 3.\n            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.\n            - proxies: Dict of proxies to use.\n            - proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).\n            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.\n            - verify: Whether to verify HTTPS certificates.\n            - cert: Tuple of (cert, key) filenames for the client certificate.\n            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.\n            - http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.\n            - stealthy_headers: If enabled (default), it creates and adds real browser headers.\n        :return: A `Response` object.\n        \"\"\"\n        stealthy_headers = kwargs.pop(\"stealthy_headers\", None)\n        return self._make_request(\"PUT\", stealth=stealthy_headers, url=url, **kwargs)\n\n    def delete(self, url: str, **kwargs: Unpack[DataRequestParams]) -> Awaitable[Response]:\n        \"\"\"\n        Perform a DELETE request.\n\n        Any additional keyword arguments are passed to the `curl_cffi.requests.AsyncSession().request()` method.\n\n        :param url: Target URL for the request.\n        :param kwargs: Additional keyword arguments including:\n            - data: Form data to include in the request body.\n            - json: A JSON serializable object to include in the body of the request.\n            - params: Query string parameters for the request.\n            - headers: Headers to include in the request.\n            - cookies: Cookies to use in the request.\n            - timeout: Number of seconds to wait before timing out.\n            - follow_redirects: Whether to follow redirects. Defaults to True.\n            - max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.\n            - retries: Number of retry attempts. Defaults to 3.\n            - retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.\n            - proxies: Dict of proxies to use.\n            - proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n            - proxy_auth: HTTP basic auth for proxy, tuple of (username, password).\n            - auth: HTTP basic auth tuple of (username, password). Only basic auth is supported.\n            - verify: Whether to verify HTTPS certificates.\n            - cert: Tuple of (cert, key) filenames for the client certificate.\n            - impersonate: Browser version to impersonate. Automatically defaults to the latest available Chrome version.\n            - http3: Whether to use HTTP3. Defaults to False. 
It might be problematic if used it with `impersonate`.\n            - stealthy_headers: If enabled (default), it creates and adds real browser headers.\n        :return: A `Response` object.\n        \"\"\"\n        # Careful of sending a body in a DELETE request, it might cause some websites to reject the request as per https://www.rfc-editor.org/rfc/rfc7231#section-4.3.5,\n        # But some websites accept it, it depends on the implementation used.\n        stealthy_headers = kwargs.pop(\"stealthy_headers\", None)\n        return self._make_request(\"DELETE\", stealth=stealthy_headers, url=url, **kwargs)\n\n\nclass FetcherSession:\n    \"\"\"\n    A factory context manager that provides configured Fetcher sessions.\n\n    When this manager is used in a 'with' or 'async with' block,\n    it yields a new session configured with the manager's defaults.\n    A single instance of this manager should ideally be used for one active\n    session at a time (or sequentially). Re-entering a context with the\n    same manager instance while a session is already active is disallowed.\n    \"\"\"\n\n    __slots__ = (\n        \"_default_impersonate\",\n        \"_stealth\",\n        \"_default_proxies\",\n        \"_default_proxy\",\n        \"_default_proxy_auth\",\n        \"_default_timeout\",\n        \"_default_headers\",\n        \"_default_retries\",\n        \"_default_retry_delay\",\n        \"_default_follow_redirects\",\n        \"_default_max_redirects\",\n        \"_default_verify\",\n        \"_default_cert\",\n        \"_default_http3\",\n        \"selector_config\",\n        \"_client\",\n        \"_is_alive\",\n        \"_proxy_rotator\",\n    )\n\n    def __init__(\n        self,\n        impersonate: ImpersonateType = \"chrome\",\n        http3: Optional[bool] = False,\n        stealthy_headers: Optional[bool] = True,\n        proxies: Optional[Dict[str, str]] = None,\n        proxy: Optional[str] = None,\n        proxy_auth: Optional[Tuple[str, str]] = None,\n        timeout: Optional[int | float] = 30,\n        headers: Optional[Dict[str, str]] = None,\n        retries: Optional[int] = 3,\n        retry_delay: Optional[int] = 1,\n        follow_redirects: bool = True,\n        max_redirects: int = 30,\n        verify: bool = True,\n        cert: Optional[str | Tuple[str, str]] = None,\n        selector_config: Optional[Dict] = None,\n        proxy_rotator: Optional[ProxyRotator] = None,\n    ):\n        \"\"\"\n        :param impersonate: Browser version to impersonate. Can be a single browser string or a list of browser strings for random selection. (Default: latest available Chrome version)\n        :param http3: Whether to use HTTP3. Defaults to False. It might be problematic if used it with `impersonate`.\n        :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets a Google referer header.\n        :param proxies: Dict of proxies to use. Format: {\"http\": proxy_url, \"https\": proxy_url}.\n        :param proxy: Proxy URL to use. Format: \"http://username:password@localhost:8030\".\n                     Cannot be used together with the `proxies` parameter.\n        :param proxy_auth: HTTP basic auth for proxy, tuple of (username, password).\n        :param timeout: Number of seconds to wait before timing out.\n        :param headers: Headers to include in the session with every request.\n        :param retries: Number of retry attempts. 
Defaults to 3.\n        :param retry_delay: Number of seconds to wait between retry attempts. Defaults to 1 second.\n        :param follow_redirects: Whether to follow redirects. Defaults to True.\n        :param max_redirects: Maximum number of redirects. Default 30, use -1 for unlimited.\n        :param verify: Whether to verify HTTPS certificates. Defaults to True.\n        :param cert: Tuple of (cert, key) filenames for the client certificate.\n        :param selector_config: Arguments passed when creating the final Selector class.\n        :param proxy_rotator: A ProxyRotator instance for automatic proxy rotation.\n        \"\"\"\n        self._default_impersonate: ImpersonateType = impersonate\n        self._stealth = stealthy_headers\n        self._default_proxies = proxies or {}\n        self._default_proxy = proxy or None\n        self._default_proxy_auth = proxy_auth or None\n        self._default_timeout = timeout\n        self._default_headers = headers or {}\n        self._default_retries = retries\n        self._default_retry_delay = retry_delay\n        self._default_follow_redirects = follow_redirects\n        self._default_max_redirects = max_redirects\n        self._default_verify = verify\n        self._default_cert = cert\n        self._default_http3 = http3\n        self.selector_config = selector_config or {}\n        self._is_alive = False\n        self._client: _SyncSessionLogic | _ASyncSessionLogic | None = None\n        self._proxy_rotator = proxy_rotator\n\n    def __enter__(self) -> _SyncSessionLogic:\n        \"\"\"Creates and returns a new synchronous Fetcher Session\"\"\"\n        if self._client is None:\n            # Use **vars(self) to avoid repeating all parameters\n            config = {k.replace(\"_default_\", \"\"): getattr(self, k) for k in self.__slots__ if k.startswith(\"_default\")}\n            config[\"stealthy_headers\"] = self._stealth\n            config[\"selector_config\"] = self.selector_config\n            config[\"proxy_rotator\"] = self._proxy_rotator\n            self._client = _SyncSessionLogic(**config)\n            self._is_alive = True\n            return self._client.__enter__()\n        raise RuntimeError(\"This FetcherSession instance already has an active synchronous session.\")\n\n    def __exit__(self, exc_type, exc_val, exc_tb):\n        if self._client is not None and isinstance(self._client, _SyncSessionLogic):\n            self._client.__exit__(exc_type, exc_val, exc_tb)\n            self._client = None\n            self._is_alive = False\n            return\n        raise RuntimeError(\"Cannot exit invalid session\")\n\n    async def __aenter__(self) -> _ASyncSessionLogic:\n        \"\"\"Creates and returns a new asynchronous Session.\"\"\"\n        if self._client is None:\n            # Use **vars(self) to avoid repeating all parameters\n            config = {k.replace(\"_default_\", \"\"): getattr(self, k) for k in self.__slots__ if k.startswith(\"_default\")}\n            config[\"stealthy_headers\"] = self._stealth\n            config[\"selector_config\"] = self.selector_config\n            config[\"proxy_rotator\"] = self._proxy_rotator\n            self._client = _ASyncSessionLogic(**config)\n            self._is_alive = True\n            return await self._client.__aenter__()\n        raise RuntimeError(\"This FetcherSession instance already has an active asynchronous session.\")\n\n    async def __aexit__(self, exc_type, exc_val, exc_tb):\n        if self._client is not None and isinstance(self._client, 
_ASyncSessionLogic):\n            await self._client.__aexit__(exc_type, exc_val, exc_tb)\n            self._client = None\n            self._is_alive = False\n            return\n        raise RuntimeError(\"Cannot exit invalid session\")\n\n\nclass FetcherClient(_SyncSessionLogic):\n    __slots__ = (\"__enter__\", \"__exit__\")\n\n    def __init__(self, **kwargs: Any) -> None:\n        super().__init__(**kwargs)\n        self.__enter__: Any = None\n        self.__exit__: Any = None\n        self._curl_session: Any = _NO_SESSION\n\n\nclass AsyncFetcherClient(_ASyncSessionLogic):\n    __slots__ = (\"__aenter__\", \"__aexit__\")\n\n    def __init__(self, **kwargs: Any) -> None:\n        super().__init__(**kwargs)\n        self.__aenter__: Any = None\n        self.__aexit__: Any = None\n        self._async_curl_session: Any = _NO_SESSION\n"
  },
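A minimal usage sketch for the session classes in the file above, assuming the public import path is `scrapling.fetchers` (that module name is an assumption from the repository layout, not confirmed by this file); it only exercises the `get`/`post` signatures documented in the docstrings.

# Hedged sketch: the import path below is an assumption, not confirmed by this file.
from scrapling.fetchers import FetcherSession, FetcherClient

# Context-managed session: one curl_cffi session is reused for every request.
with FetcherSession(impersonate="chrome", retries=3, retry_delay=1) as session:
    page = session.get("https://example.com", stealthy_headers=True)
    print(page.status, page.reason)

# One-off client: no context manager; a fresh curl_cffi session is created per request.
client = FetcherClient(impersonate="chrome")
result = client.post("https://httpbin.org/post", json={"hello": "world"}, timeout=30)
print(result.status)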
  {
    "path": "scrapling/engines/toolbelt/__init__.py",
    "content": "from .proxy_rotation import ProxyRotator, is_proxy_error, cyclic_rotation\n\n__all__ = [\"ProxyRotator\", \"is_proxy_error\", \"cyclic_rotation\"]\n"
  },
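A short sketch of the proxy-rotation helpers re-exported above; `ProxyRotator` and `cyclic_rotation` are defined in `proxy_rotation.py` later in this listing, and `get_proxy()` is what the fetcher sessions call on each attempt when no static proxy is supplied.

from scrapling.engines.toolbelt import ProxyRotator, cyclic_rotation

rotator = ProxyRotator(
    proxies=[
        "http://user:pass@proxy1:8080",  # string form
        {"server": "http://proxy2:8080", "username": "user", "password": "pass"},  # Playwright-style dict
    ],
    strategy=cyclic_rotation,  # default strategy: (proxies, index) -> (proxy, next_index)
)
proxy = rotator.get_proxy()  # the fetchers call this per attempt when rotation is enabled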
  {
    "path": "scrapling/engines/toolbelt/convertor.py",
    "content": "from functools import lru_cache\nfrom re import compile as re_compile\n\nfrom curl_cffi.requests import Response as CurlResponse\nfrom playwright._impl._errors import Error as PlaywrightError\nfrom playwright.sync_api import Page as SyncPage, Response as SyncResponse\nfrom playwright.async_api import Page as AsyncPage, Response as AsyncResponse\n\nfrom scrapling.core.utils import log\nfrom .custom import Response, StatusText\nfrom scrapling.core._types import Dict, Optional\n\n__CHARSET_RE__ = re_compile(r\"charset=([\\w-]+)\")\n\n\nclass ResponseFactory:\n    \"\"\"\n    Factory class for creating `Response` objects from various sources.\n\n    This class provides multiple static and instance methods for building standardized `Response` objects\n    from diverse input sources such as Playwright responses, asynchronous Playwright responses,\n    and raw HTTP request responses. It supports handling response histories, constructing the proper\n    response objects, and managing encoding, headers, cookies, and other attributes.\n    \"\"\"\n\n    @classmethod\n    @lru_cache(maxsize=16)\n    def __extract_browser_encoding(cls, content_type: str | None, default: str = \"utf-8\") -> str:\n        \"\"\"Extract browser encoding from headers.\n        Ex: from header \"content-type: text/html; charset=utf-8\" -> \"utf-8\n        \"\"\"\n        if content_type:\n            # Because Playwright can't do that by themselves like all libraries for some reason :3\n            match = __CHARSET_RE__.search(content_type)\n            return match.group(1) if match else default\n        return default\n\n    @classmethod\n    def _process_response_history(cls, first_response: SyncResponse, parser_arguments: Dict) -> list[Response]:\n        \"\"\"Process response history to build a list of `Response` objects\"\"\"\n        history: list[Response] = []\n        current_request = first_response.request.redirected_from\n\n        try:\n            while current_request:\n                try:\n                    current_response = current_request.response()\n                    history.insert(\n                        0,\n                        Response(\n                            **{\n                                \"url\": current_request.url,\n                                # using current_response.text() will trigger \"Error: Response.text: Response body is unavailable for redirect responses\"\n                                \"content\": \"\",\n                                \"status\": current_response.status if current_response else 301,\n                                \"reason\": (current_response.status_text or StatusText.get(current_response.status))\n                                if current_response\n                                else StatusText.get(301),\n                                \"encoding\": cls.__extract_browser_encoding(\n                                    current_response.headers.get(\"content-type\", \"\")\n                                )\n                                if current_response\n                                else \"utf-8\",\n                                \"cookies\": tuple(),\n                                \"headers\": current_response.all_headers() if current_response else {},\n                                \"request_headers\": current_request.all_headers(),\n                                **parser_arguments,\n                            }\n                        ),\n                    )\n                except Exception as e:  
# pragma: no cover\n                    log.error(f\"Error processing redirect: {e}\")\n                    break\n\n                current_request = current_request.redirected_from\n        except Exception as e:  # pragma: no cover\n            log.error(f\"Error processing response history: {e}\")\n\n        return history\n\n    @classmethod\n    def from_playwright_response(\n        cls,\n        page: SyncPage,\n        first_response: SyncResponse,\n        final_response: Optional[SyncResponse],\n        parser_arguments: Dict,\n        meta: Optional[Dict] = None,\n    ) -> Response:\n        \"\"\"\n        Transforms a Playwright response into an internal `Response` object, encapsulating\n        the page's content, response status, headers, and relevant metadata.\n\n        The function handles potential issues, such as empty or missing final responses,\n        by falling back to the first response if necessary. Encoding and status text\n        are also derived from the provided response headers or reasonable defaults.\n        Additionally, the page content and cookies are extracted for further use.\n\n        :param page: A synchronous Playwright `Page` instance that represents the current browser page. Required to retrieve the page's URL, cookies, and content.\n        :param final_response: The last response received for the given request from the Playwright instance. Typically used as the main response object to derive status, headers, and other metadata.\n        :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.\n        :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. 
These arguments are dynamically unpacked into\n            the `Response` object.\n        :param meta: Additional meta data to be saved with the response.\n\n        :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.\n        :rtype: Response\n        \"\"\"\n        # In case we didn't catch a document type somehow\n        final_response = final_response if final_response else first_response\n        if not final_response:\n            raise ValueError(\"Failed to get a response from the page\")\n\n        encoding = cls.__extract_browser_encoding(final_response.headers.get(\"content-type\", \"\"))\n        # PlayWright API sometimes give empty status text for some reason!\n        status_text = final_response.status_text or StatusText.get(final_response.status)\n\n        history = cls._process_response_history(first_response, parser_arguments)\n        try:\n            if \"html\" in final_response.all_headers().get(\"content-type\", \"\"):\n                page_content = cls._get_page_content(page).encode(\"utf-8\")\n            else:\n                page_content = final_response.body()\n        except Exception as e:  # pragma: no cover\n            log.error(f\"Error getting page content: {e}\")\n            page_content = b\"\"\n\n        return Response(\n            **{\n                \"url\": page.url,\n                \"content\": page_content,\n                \"status\": final_response.status,\n                \"reason\": status_text,\n                \"encoding\": encoding,\n                \"cookies\": tuple(dict(cookie) for cookie in page.context.cookies()),\n                \"headers\": first_response.all_headers(),\n                \"request_headers\": first_response.request.all_headers(),\n                \"history\": history,\n                \"meta\": meta,\n                **parser_arguments,\n            }\n        )\n\n    @classmethod\n    async def _async_process_response_history(\n        cls, first_response: AsyncResponse, parser_arguments: Dict\n    ) -> list[Response]:\n        \"\"\"Process response history to build a list of `Response` objects\"\"\"\n        history: list[Response] = []\n        current_request = first_response.request.redirected_from\n\n        try:\n            while current_request:\n                try:\n                    current_response = await current_request.response()\n                    history.insert(\n                        0,\n                        Response(\n                            **{\n                                \"url\": current_request.url,\n                                # using current_response.text() will trigger \"Error: Response.text: Response body is unavailable for redirect responses\"\n                                \"content\": \"\",\n                                \"status\": current_response.status if current_response else 301,\n                                \"reason\": (current_response.status_text or StatusText.get(current_response.status))\n                                if current_response\n                                else StatusText.get(301),\n                                \"encoding\": cls.__extract_browser_encoding(\n                                    current_response.headers.get(\"content-type\", \"\")\n                                )\n                                if current_response\n                                else \"utf-8\",\n                                \"cookies\": 
tuple(),\n                                \"headers\": await current_response.all_headers() if current_response else {},\n                                \"request_headers\": await current_request.all_headers(),\n                                **parser_arguments,\n                            }\n                        ),\n                    )\n                except Exception as e:  # pragma: no cover\n                    log.error(f\"Error processing redirect: {e}\")\n                    break\n\n                current_request = current_request.redirected_from\n        except Exception as e:  # pragma: no cover\n            log.error(f\"Error processing response history: {e}\")\n\n        return history\n\n    @classmethod\n    def _get_page_content(cls, page: SyncPage) -> str:\n        \"\"\"\n        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108\n        :param page: The page to extract content from.\n        :return:\n        \"\"\"\n        while True:\n            try:\n                return page.content() or \"\"\n            except PlaywrightError:\n                page.wait_for_timeout(500)\n                continue\n        return \"\"  # pyright: ignore\n\n    @classmethod\n    async def _get_async_page_content(cls, page: AsyncPage) -> str:\n        \"\"\"\n        A workaround for the Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108\n        :param page: The page to extract content from.\n        :return:\n        \"\"\"\n        while True:\n            try:\n                return (await page.content()) or \"\"\n            except PlaywrightError:\n                await page.wait_for_timeout(500)\n                continue\n        return \"\"  # pyright: ignore\n\n    @classmethod\n    async def from_async_playwright_response(\n        cls,\n        page: AsyncPage,\n        first_response: AsyncResponse,\n        final_response: Optional[AsyncResponse],\n        parser_arguments: Dict,\n        meta: Optional[Dict] = None,\n    ) -> Response:\n        \"\"\"\n        Transforms a Playwright response into an internal `Response` object, encapsulating\n        the page's content, response status, headers, and relevant metadata.\n\n        The function handles potential issues, such as empty or missing final responses,\n        by falling back to the first response if necessary. Encoding and status text\n        are also derived from the provided response headers or reasonable defaults.\n        Additionally, the page content and cookies are extracted for further use.\n\n        :param page: An asynchronous Playwright `Page` instance that represents the current browser page. Required to retrieve the page's URL, cookies, and content.\n        :param final_response: The last response received for the given request from the Playwright instance. Typically used as the main response object to derive status, headers, and other metadata.\n        :param first_response: An earlier or initial Playwright `Response` object that may serve as a fallback response in the absence of the final one.\n        :param parser_arguments: A dictionary containing additional arguments needed for parsing or further customization of the returned `Response`. 
These arguments are dynamically unpacked into\n            the `Response` object.\n        :param meta: Additional meta data to be saved with the response.\n\n        :return: A fully populated `Response` object containing the page's URL, content, status, headers, cookies, and other derived metadata.\n        :rtype: Response\n        \"\"\"\n        # In case we didn't catch a document type somehow\n        final_response = final_response if final_response else first_response\n        if not final_response:\n            raise ValueError(\"Failed to get a response from the page\")\n\n        encoding = cls.__extract_browser_encoding(final_response.headers.get(\"content-type\", \"\"))\n        # PlayWright API sometimes give empty status text for some reason!\n        status_text = final_response.status_text or StatusText.get(final_response.status)\n\n        history = await cls._async_process_response_history(first_response, parser_arguments)\n        try:\n            if \"html\" in (await final_response.all_headers()).get(\"content-type\", \"\"):\n                page_content = (await cls._get_async_page_content(page)).encode(\"utf-8\")\n            else:\n                page_content = await final_response.body()\n        except Exception as e:  # pragma: no cover\n            log.error(f\"Error getting page content in async: {e}\")\n            page_content = b\"\"\n\n        return Response(\n            **{\n                \"url\": page.url,\n                \"content\": page_content,\n                \"status\": final_response.status,\n                \"reason\": status_text,\n                \"encoding\": encoding,\n                \"cookies\": tuple(dict(cookie) for cookie in await page.context.cookies()),\n                \"headers\": await first_response.all_headers(),\n                \"request_headers\": await first_response.request.all_headers(),\n                \"history\": history,\n                \"meta\": meta,\n                **parser_arguments,\n            }\n        )\n\n    @staticmethod\n    def from_http_request(response: CurlResponse, parser_arguments: Dict, meta: Optional[Dict] = None) -> Response:\n        \"\"\"Takes `curl_cffi` response and generates `Response` object from it.\n\n        :param response: `curl_cffi` response object\n        :param parser_arguments: Additional arguments to be passed to the `Response` object constructor.\n        :param meta: Optional metadata dictionary to attach to the Response.\n        :return: A `Response` object that is the same as `Selector` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`\n        \"\"\"\n        return Response(\n            **{\n                \"url\": response.url,\n                \"content\": response.content,\n                \"status\": response.status_code,\n                \"reason\": response.reason,\n                \"encoding\": response.encoding or \"utf-8\",\n                \"cookies\": dict(response.cookies),\n                \"headers\": dict(response.headers),\n                \"request_headers\": dict(response.request.headers) if response.request else {},\n                \"method\": response.request.method if response.request else \"GET\",\n                \"history\": response.history,  # https://github.com/lexiforest/curl_cffi/issues/82\n                \"meta\": meta,\n                **parser_arguments,\n            }\n        )\n"
  },
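A minimal sketch of `ResponseFactory.from_http_request`, which the HTTP fetchers use internally to wrap a `curl_cffi` response; the target URL is only a placeholder.

from curl_cffi import requests as curl_requests  # curl_cffi's requests-like API

from scrapling.engines.toolbelt.convertor import ResponseFactory

raw = curl_requests.get("https://example.com")  # plain curl_cffi response
# Empty parser_arguments means the Selector defaults are used for parsing.
page = ResponseFactory.from_http_request(raw, parser_arguments={}, meta={"proxy": None})
print(page.status, page.reason, page.headers.get("content-type"))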
  {
    "path": "scrapling/engines/toolbelt/custom.py",
    "content": "\"\"\"\nFunctions related to custom types or type checking\n\"\"\"\n\nfrom functools import lru_cache\n\nfrom scrapling.core.utils import log\nfrom scrapling.core._types import (\n    Any,\n    Dict,\n    cast,\n    List,\n    Tuple,\n    Union,\n    Optional,\n    Callable,\n    Sequence,\n    TYPE_CHECKING,\n    AsyncGenerator,\n)\nfrom scrapling.core.custom_types import MappingProxyType\nfrom scrapling.parser import Selector, SQLiteStorageSystem\n\nif TYPE_CHECKING:\n    from scrapling.spiders import Request\n\n\nclass Response(Selector):\n    \"\"\"This class is returned by all engines as a way to unify the response type between different libraries.\"\"\"\n\n    def __init__(\n        self,\n        url: str,\n        content: str | bytes,\n        status: int,\n        reason: str,\n        cookies: Tuple[Dict[str, str], ...] | Dict[str, str],\n        headers: Dict,\n        request_headers: Dict,\n        encoding: str = \"utf-8\",\n        method: str = \"GET\",\n        history: List | None = None,\n        meta: Dict[str, Any] | None = None,\n        **selector_config: Any,\n    ):\n        if isinstance(content, str):\n            content = content.encode(\"utf-8\")\n\n        adaptive_domain: str = cast(str, selector_config.pop(\"adaptive_domain\", \"\"))\n        self.status = status\n        self.reason = reason\n        self.cookies = cookies\n        self.headers = headers\n        self.request_headers = request_headers\n        self.history = history or []\n        super().__init__(\n            content=content,\n            url=adaptive_domain or url,\n            encoding=encoding,\n            **selector_config,\n        )\n        # For easier debugging while working from a Python shell\n        log.info(f\"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})\")\n\n        if meta and not isinstance(meta, dict):\n            raise TypeError(f\"Response meta should be dictionary but got {type(meta).__name__} instead!\")\n\n        self.meta: Dict[str, Any] = meta or {}\n        self.request: Optional[\"Request\"] = None  # Will be set by crawler\n\n    @property\n    def body(self) -> bytes:\n        \"\"\"Return the raw body of the response as bytes.\"\"\"\n        return cast(bytes, cast(Sequence, self._raw_body))\n\n    def follow(\n        self,\n        url: str,\n        sid: str = \"\",\n        callback: Callable[[\"Response\"], AsyncGenerator[Union[Dict[str, Any], \"Request\", None], None]] | None = None,\n        priority: int | None = None,\n        dont_filter: bool = False,\n        meta: dict[str, Any] | None = None,\n        referer_flow: bool = True,\n        **kwargs: Any,\n    ) -> Any:\n        \"\"\"Create a Request to follow a URL.\n\n        This is a helper method for spiders to easily follow links found in pages.\n\n        **IMPORTANT**: The below arguments if left empty, the corresponding value from the previous request will be used. 
The only exception is `dont_filter`.\n\n        :param url: The URL to follow (can be relative, will be joined with current URL)\n        :param sid: The session id to use\n        :param callback: Spider callback method to use\n        :param priority: The priority number to use, the higher the number, the higher priority to be processed first.\n        :param dont_filter: If this request has been done before, disable the filter to allow it again.\n        :param meta: Additional meta data to included in the request\n        :param referer_flow: Enabled by default, set the current response url as referer for the new request url.\n        :param kwargs: Additional Request arguments\n        :return: Request object ready to be yielded\n        \"\"\"\n        from scrapling.spiders import Request\n\n        if not self.request or not isinstance(self.request, Request):\n            raise TypeError(\"This response has no request set yet.\")\n\n        # Merge original session kwargs with new kwargs (new takes precedence)\n        session_kwargs = {**self.request._session_kwargs, **kwargs}\n\n        if referer_flow:\n            # For requests\n            headers = session_kwargs.get(\"headers\", {})\n            headers[\"referer\"] = self.url\n            session_kwargs[\"headers\"] = headers\n\n            # For browsers\n            extra_headers = session_kwargs.get(\"extra_headers\", {})\n            extra_headers[\"referer\"] = self.url\n            session_kwargs[\"extra_headers\"] = extra_headers\n\n            session_kwargs[\"google_search\"] = False\n\n        return Request(\n            url=self.urljoin(url),\n            sid=sid or self.request.sid,\n            callback=callback or self.request.callback,\n            priority=priority if priority is not None else self.request.priority,\n            dont_filter=dont_filter,\n            meta={**(self.meta or {}), **(meta or {})},\n            **session_kwargs,\n        )\n\n    def __str__(self) -> str:\n        return f\"<{self.status} {self.url}>\"\n\n\nclass BaseFetcher:\n    __slots__ = ()\n    huge_tree: bool = True\n    adaptive: Optional[bool] = False\n    storage: Any = SQLiteStorageSystem\n    keep_cdata: Optional[bool] = False\n    storage_args: Optional[Dict] = None\n    keep_comments: Optional[bool] = False\n    adaptive_domain: str = \"\"\n    parser_keywords: Tuple = (\n        \"huge_tree\",\n        \"adaptive\",\n        \"storage\",\n        \"keep_cdata\",\n        \"storage_args\",\n        \"keep_comments\",\n        \"adaptive_domain\",\n    )  # Left open for the user\n\n    def __init__(self, *args, **kwargs):\n        # For backward-compatibility before 0.2.99\n        args_str = \", \".join(args) or \"\"\n        kwargs_str = \", \".join(f\"{k}={v}\" for k, v in kwargs.items()) or \"\"\n        if args_str:\n            args_str += \", \"\n\n        log.warning(\n            f\"This logic is deprecated now, and have no effect; It will be removed with v0.3. 
Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching\"\n        )\n        pass\n\n    @classmethod\n    def display_config(cls):\n        return dict(\n            huge_tree=cls.huge_tree,\n            keep_comments=cls.keep_comments,\n            keep_cdata=cls.keep_cdata,\n            adaptive=cls.adaptive,\n            storage=cls.storage,\n            storage_args=cls.storage_args,\n            adaptive_domain=cls.adaptive_domain,\n        )\n\n    @classmethod\n    def configure(cls, **kwargs):\n        \"\"\"Set multiple arguments for the parser at once globally\n\n        :param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain\n        \"\"\"\n        for key, value in kwargs.items():\n            key = key.strip().lower()\n            if hasattr(cls, key):\n                if key in cls.parser_keywords:\n                    setattr(cls, key, value)\n                else:\n                    # Yup, no fun allowed LOL\n                    raise AttributeError(f'Unknown parser argument: \"{key}\"; maybe you meant {cls.parser_keywords}?')\n            else:\n                raise ValueError(f'Unknown parser argument: \"{key}\"; maybe you meant {cls.parser_keywords}?')\n\n        if not kwargs:\n            raise AttributeError(f\"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?\")\n\n    @classmethod\n    def _generate_parser_arguments(cls) -> Dict:\n        # Selector class parameters\n        # I won't validate Selector's class parameters here again, I will leave it to be validated later\n        parser_arguments = dict(\n            huge_tree=cls.huge_tree,\n            keep_comments=cls.keep_comments,\n            keep_cdata=cls.keep_cdata,\n            adaptive=cls.adaptive,\n            storage=cls.storage,\n            storage_args=cls.storage_args,\n            adaptive_domain=cls.adaptive_domain,\n        )\n\n        return parser_arguments\n\n\nclass StatusText:\n    \"\"\"A class that gets the status text of the response status code.\n\n    Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status\n    \"\"\"\n\n    _phrases = MappingProxyType(\n        {\n            100: \"Continue\",\n            101: \"Switching Protocols\",\n            102: \"Processing\",\n            103: \"Early Hints\",\n            200: \"OK\",\n            201: \"Created\",\n            202: \"Accepted\",\n            203: \"Non-Authoritative Information\",\n            204: \"No Content\",\n            205: \"Reset Content\",\n            206: \"Partial Content\",\n            207: \"Multi-Status\",\n            208: \"Already Reported\",\n            226: \"IM Used\",\n            300: \"Multiple Choices\",\n            301: \"Moved Permanently\",\n            302: \"Found\",\n            303: \"See Other\",\n            304: \"Not Modified\",\n            305: \"Use Proxy\",\n            307: \"Temporary Redirect\",\n            308: \"Permanent Redirect\",\n            400: \"Bad Request\",\n            401: \"Unauthorized\",\n            402: \"Payment Required\",\n            403: \"Forbidden\",\n            404: \"Not Found\",\n            405: \"Method Not Allowed\",\n            406: \"Not Acceptable\",\n            407: \"Proxy Authentication Required\",\n            408: \"Request Timeout\",\n            409: \"Conflict\",\n            410: \"Gone\",\n            411: \"Length Required\",\n            412: 
\"Precondition Failed\",\n            413: \"Payload Too Large\",\n            414: \"URI Too Long\",\n            415: \"Unsupported Media Type\",\n            416: \"Range Not Satisfiable\",\n            417: \"Expectation Failed\",\n            418: \"I'm a teapot\",\n            421: \"Misdirected Request\",\n            422: \"Unprocessable Entity\",\n            423: \"Locked\",\n            424: \"Failed Dependency\",\n            425: \"Too Early\",\n            426: \"Upgrade Required\",\n            428: \"Precondition Required\",\n            429: \"Too Many Requests\",\n            431: \"Request Header Fields Too Large\",\n            451: \"Unavailable For Legal Reasons\",\n            500: \"Internal Server Error\",\n            501: \"Not Implemented\",\n            502: \"Bad Gateway\",\n            503: \"Service Unavailable\",\n            504: \"Gateway Timeout\",\n            505: \"HTTP Version Not Supported\",\n            506: \"Variant Also Negotiates\",\n            507: \"Insufficient Storage\",\n            508: \"Loop Detected\",\n            510: \"Not Extended\",\n            511: \"Network Authentication Required\",\n        }\n    )\n\n    @classmethod\n    @lru_cache(maxsize=128)\n    def get(cls, status_code: int) -> str:\n        \"\"\"Get the phrase for a given HTTP status code.\"\"\"\n        return cls._phrases.get(status_code, \"Unknown Status Code\")\n"
  },
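A small sketch of the class-level parser configuration and status-text lookup defined above; the `MyFetcher` subclass is purely illustrative, and any concrete `BaseFetcher` subclass is configured the same way.

from scrapling.engines.toolbelt.custom import BaseFetcher, StatusText


class MyFetcher(BaseFetcher):  # illustrative subclass; real fetchers inherit the same classmethods
    pass


MyFetcher.configure(keep_comments=True, adaptive=True)  # only keys in BaseFetcher.parser_keywords are accepted
print(MyFetcher.display_config())  # current parser defaults passed to every Selector/Response
print(StatusText.get(429))  # -> "Too Many Requests"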
  {
    "path": "scrapling/engines/toolbelt/fingerprints.py",
    "content": "\"\"\"\nFunctions related to generating headers and fingerprints generally\n\"\"\"\n\nfrom functools import lru_cache\nfrom platform import system as platform_system\n\nfrom browserforge.headers import Browser, HeaderGenerator\nfrom browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS\n\nfrom scrapling.core._types import Dict, Literal, Tuple\n\n__OS_NAME__ = platform_system()\nOSName = Literal[\"linux\", \"macos\", \"windows\"]\n# Current versions hardcoded for now (Playwright doesn't allow to know the version of a browser without launching it)\nchromium_version = 145\nchrome_version = 145\n\n\n@lru_cache(1, typed=True)\ndef get_os_name() -> OSName | Tuple:\n    \"\"\"Get the current OS name in the same format needed for browserforge, if the OS is Unknown, return None so browserforge uses all.\n\n    :return: Current OS name or `None` otherwise\n    \"\"\"\n    match __OS_NAME__:  # pragma: no cover\n        case \"Linux\":\n            return \"linux\"\n        case \"Darwin\":\n            return \"macos\"\n        case \"Windows\":\n            return \"windows\"\n        case _:\n            return SUPPORTED_OPERATING_SYSTEMS\n\n\ndef generate_headers(browser_mode: bool | str = False) -> Dict:\n    \"\"\"Generate real browser-like headers using browserforge's generator\n\n    :param browser_mode: If enabled, the headers created are used for playwright, so it has to match everything\n    :return: A dictionary of the generated headers\n    \"\"\"\n    # In the browser mode, we don't care about anything other than matching the OS and the browser type with the browser we are using,\n    # So we don't raise any inconsistency red flags while websites fingerprinting us\n    os_name = get_os_name()\n    ver = chrome_version if browser_mode and browser_mode == \"chrome\" else chromium_version\n    browsers = [Browser(name=\"chrome\", min_version=ver, max_version=ver)]\n    if not browser_mode:\n        os_name = (\"windows\", \"macos\", \"linux\")\n        browsers.extend(\n            [\n                Browser(name=\"firefox\", min_version=142),\n                Browser(name=\"edge\", min_version=140),\n            ]\n        )\n    return HeaderGenerator(browser=browsers, os=os_name, device=\"desktop\").generate()\n\n\n__default_useragent__ = generate_headers(browser_mode=False).get(\"User-Agent\")\n"
  },
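A quick sketch of the header-generation helpers above; `browser_mode=False` is what the HTTP fetchers use, while a browser name is passed when the headers must match a launched Playwright browser.

from scrapling.engines.toolbelt.fingerprints import generate_headers, get_os_name

print(get_os_name())  # e.g. "linux", "macos", or "windows" (or all supported OSes if unknown)
http_headers = generate_headers(browser_mode=False)  # mixed desktop browsers, any desktop OS
chrome_headers = generate_headers(browser_mode="chrome")  # Chrome only, matched to the current OS
print(http_headers.get("User-Agent"))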
  {
    "path": "scrapling/engines/toolbelt/navigation.py",
    "content": "\"\"\"\nFunctions related to files and URLs\n\"\"\"\n\nfrom urllib.parse import urlparse\n\nfrom playwright.async_api import Route as async_Route\nfrom msgspec import Struct, structs, convert, ValidationError\nfrom playwright.sync_api import Route\n\nfrom scrapling.core.utils import log\nfrom scrapling.core._types import Dict, Set, Tuple, Optional, Callable\nfrom scrapling.engines.constants import EXTRA_RESOURCES\n\n\nclass ProxyDict(Struct):\n    server: str\n    username: str = \"\"\n    password: str = \"\"\n\n\ndef create_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:\n    \"\"\"Create a route handler that blocks both resource types and specific domains.\n\n    :param disable_resources: Whether to block default resource types.\n    :param blocked_domains: Set of domain names to block requests to.\n    :return: A sync route handler function.\n    \"\"\"\n    disabled_resources = EXTRA_RESOURCES if disable_resources else set()\n    domains = blocked_domains or set()\n\n    def handler(route: Route):\n        if route.request.resource_type in disabled_resources:\n            log.debug(f'Blocking background resource \"{route.request.url}\" of type \"{route.request.resource_type}\"')\n            route.abort()\n        elif domains:\n            hostname = urlparse(route.request.url).hostname or \"\"\n            if any(hostname == d or hostname.endswith(\".\" + d) for d in domains):\n                log.debug(f'Blocking request to blocked domain \"{hostname}\" ({route.request.url})')\n                route.abort()\n            else:\n                route.continue_()\n        else:\n            route.continue_()\n\n    return handler\n\n\ndef create_async_intercept_handler(disable_resources: bool, blocked_domains: Optional[Set[str]] = None) -> Callable:\n    \"\"\"Create an async route handler that blocks both resource types and specific domains.\n\n    :param disable_resources: Whether to block default resource types.\n    :param blocked_domains: Set of domain names to block requests to.\n    :return: An async route handler function.\n    \"\"\"\n    disabled_resources = EXTRA_RESOURCES if disable_resources else set()\n    domains = blocked_domains or set()\n\n    async def handler(route: async_Route):\n        if route.request.resource_type in disabled_resources:\n            log.debug(f'Blocking background resource \"{route.request.url}\" of type \"{route.request.resource_type}\"')\n            await route.abort()\n        elif domains:\n            hostname = urlparse(route.request.url).hostname or \"\"\n            if any(hostname == d or hostname.endswith(\".\" + d) for d in domains):\n                log.debug(f'Blocking request to blocked domain \"{hostname}\" ({route.request.url})')\n                await route.abort()\n            else:\n                await route.continue_()\n        else:\n            await route.continue_()\n\n    return handler\n\n\ndef construct_proxy_dict(proxy_string: str | Dict[str, str] | Tuple) -> Dict:\n    \"\"\"Validate a proxy and return it in the acceptable format for Playwright\n    Reference: https://playwright.dev/python/docs/network#http-proxy\n\n    :param proxy_string: A string or a dictionary representation of the proxy.\n    :return:\n    \"\"\"\n    if isinstance(proxy_string, str):\n        proxy = urlparse(proxy_string)\n        if proxy.scheme not in (\"http\", \"https\", \"socks4\", \"socks5\") or not proxy.hostname:\n            raise ValueError(\"Invalid proxy 
string!\")\n\n        try:\n            result = {\n                \"server\": f\"{proxy.scheme}://{proxy.hostname}\",\n                \"username\": proxy.username or \"\",\n                \"password\": proxy.password or \"\",\n            }\n            if proxy.port:\n                result[\"server\"] += f\":{proxy.port}\"\n            return result\n        except ValueError:\n            # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...\n            raise ValueError(\"The proxy argument's string is in invalid format!\")\n\n    elif isinstance(proxy_string, dict):\n        try:\n            validated = convert(proxy_string, ProxyDict)\n            result_dict = structs.asdict(validated)\n            return result_dict\n        except ValidationError as e:\n            raise TypeError(f\"Invalid proxy dictionary: {e}\")\n\n    raise TypeError(f\"Invalid proxy string: {proxy_string}\")\n"
  },
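A minimal usage sketch for the helpers above, assuming the module is importable as `scrapling.engines.toolbelt.navigation` (its path in this repo); all proxy addresses are placeholders:

```python
# Sketch: converting proxy strings/dicts into the Playwright-style format with
# construct_proxy_dict (defined above). The proxy URLs here are examples only.
from scrapling.engines.toolbelt.navigation import construct_proxy_dict

# A bare URL becomes {"server": "socks5://127.0.0.1:9050", "username": "", "password": ""}
print(construct_proxy_dict("socks5://127.0.0.1:9050"))

# Credentials embedded in the URL are split out into "username"/"password"
print(construct_proxy_dict("http://user:pass@proxy.example.com:8080"))

# Dict input is validated against the ProxyDict struct and returned as a plain dict
print(construct_proxy_dict({"server": "http://proxy.example.com:8080", "username": "user"}))

# Unsupported schemes (or a missing hostname) raise ValueError
try:
    construct_proxy_dict("ftp://proxy.example.com")
except ValueError as e:
    print(e)
```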
  {
    "path": "scrapling/engines/toolbelt/proxy_rotation.py",
    "content": "from threading import Lock\n\nfrom scrapling.core._types import Callable, Dict, List, Tuple, ProxyType\n\n\nRotationStrategy = Callable[[List[ProxyType], int], Tuple[ProxyType, int]]\n_PROXY_ERROR_INDICATORS = {\n    \"net::err_proxy\",\n    \"net::err_tunnel\",\n    \"connection refused\",\n    \"connection reset\",\n    \"connection timed out\",\n    \"failed to connect\",\n    \"could not resolve proxy\",\n}\n\n\ndef _get_proxy_key(proxy: ProxyType) -> str:\n    \"\"\"Generate a unique key for a proxy (for dicts it's server plus username).\"\"\"\n    if isinstance(proxy, str):\n        return proxy\n    server = proxy.get(\"server\", \"\")\n    username = proxy.get(\"username\", \"\")\n    return f\"{server}|{username}\"\n\n\ndef is_proxy_error(error: Exception) -> bool:\n    \"\"\"Check if an error is proxy-related. Works for both HTTP and browser errors.\"\"\"\n    error_msg = str(error).lower()\n    return any(indicator in error_msg for indicator in _PROXY_ERROR_INDICATORS)\n\n\ndef cyclic_rotation(proxies: List[ProxyType], current_index: int) -> Tuple[ProxyType, int]:\n    \"\"\"Default cyclic rotation strategy — iterates through proxies sequentially, wrapping around at the end.\"\"\"\n    idx = current_index % len(proxies)\n    return proxies[idx], (idx + 1) % len(proxies)\n\n\nclass ProxyRotator:\n    \"\"\"\n    A thread-safe proxy rotator with pluggable rotation strategies.\n\n    Supports:\n    - Cyclic rotation (default)\n    - Custom rotation strategies via callable\n    - Both string URLs and Playwright-style dict proxies\n    \"\"\"\n\n    __slots__ = (\"_proxies\", \"_proxy_to_index\", \"_strategy\", \"_current_index\", \"_lock\")\n\n    def __init__(\n        self,\n        proxies: List[ProxyType],\n        strategy: RotationStrategy = cyclic_rotation,\n    ):\n        \"\"\"\n        Initialize the proxy rotator.\n\n        :param proxies: List of proxy URLs or Playwright-style proxy dicts.\n            - String format: \"http://proxy1:8080\" or \"http://user:pass@proxy:8080\"\n            - Dict format: {\"server\": \"http://proxy:8080\", \"username\": \"user\", \"password\": \"pass\"}\n        :param strategy: Rotation strategy function. Takes (proxies, current_index) and returns (proxy, next_index). Defaults to cyclic_rotation.\n        \"\"\"\n        if not proxies:\n            raise ValueError(\"At least one proxy must be provided\")\n\n        if not callable(strategy):\n            raise TypeError(f\"strategy must be callable, got {type(strategy).__name__}\")\n\n        self._strategy = strategy\n        self._lock = Lock()\n\n        # Validate and store proxies\n        self._proxies: List[ProxyType] = []\n        self._proxy_to_index: Dict[str, int] = {}  # O(1) lookup by unique key (server + username)\n        for i, proxy in enumerate(proxies):\n            if isinstance(proxy, (str, dict)):\n                if isinstance(proxy, dict) and \"server\" not in proxy:\n                    raise ValueError(\"Proxy dict must have a 'server' key\")\n\n                self._proxy_to_index[_get_proxy_key(proxy)] = i\n                self._proxies.append(proxy)\n            else:\n                raise TypeError(f\"Invalid proxy type: {type(proxy)}. 
Expected str or dict.\")\n\n        self._current_index = 0\n\n    def get_proxy(self) -> ProxyType:\n        \"\"\"Get the next proxy according to the rotation strategy.\"\"\"\n        with self._lock:\n            proxy, self._current_index = self._strategy(self._proxies, self._current_index)\n            return proxy\n\n    @property\n    def proxies(self) -> List[ProxyType]:\n        \"\"\"Get a copy of all configured proxies.\"\"\"\n        return list(self._proxies)\n\n    def __len__(self) -> int:\n        \"\"\"Return the total number of configured proxies.\"\"\"\n        return len(self._proxies)\n\n    def __repr__(self) -> str:\n        return f\"ProxyRotator(proxies={len(self._proxies)})\"\n"
  },
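A short sketch of how the rotator above is meant to be used; the proxy addresses are placeholders, and the random strategy only illustrates the `(proxies, current_index) -> (proxy, next_index)` contract:

```python
# Sketch: cyclic rotation plus a custom strategy for ProxyRotator (defined above).
import random
from scrapling.engines.toolbelt.proxy_rotation import ProxyRotator

proxies = [
    "http://proxy1.example.com:8080",
    "http://user:pass@proxy2.example.com:8080",
    {"server": "http://proxy3.example.com:8080", "username": "user", "password": "pass"},
]

rotator = ProxyRotator(proxies)          # default strategy: cyclic_rotation
for _ in range(4):                       # wraps around after the third proxy
    print(rotator.get_proxy())

def random_rotation(pool, _current_index):
    # Any callable taking (proxies, current_index) and returning (proxy, next_index) works
    idx = random.randrange(len(pool))
    return pool[idx], idx

print(ProxyRotator(proxies, strategy=random_rotation).get_proxy())
```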
  {
    "path": "scrapling/fetchers/__init__.py",
    "content": "from typing import TYPE_CHECKING, Any\nfrom scrapling.engines.toolbelt import ProxyRotator\n\nif TYPE_CHECKING:\n    from scrapling.fetchers.requests import Fetcher, AsyncFetcher, FetcherSession\n    from scrapling.fetchers.chrome import DynamicFetcher, DynamicSession, AsyncDynamicSession\n    from scrapling.fetchers.stealth_chrome import StealthyFetcher, StealthySession, AsyncStealthySession\n\n\n# Lazy import mapping\n_LAZY_IMPORTS = {\n    \"Fetcher\": (\"scrapling.fetchers.requests\", \"Fetcher\"),\n    \"AsyncFetcher\": (\"scrapling.fetchers.requests\", \"AsyncFetcher\"),\n    \"FetcherSession\": (\"scrapling.fetchers.requests\", \"FetcherSession\"),\n    \"DynamicFetcher\": (\"scrapling.fetchers.chrome\", \"DynamicFetcher\"),\n    \"DynamicSession\": (\"scrapling.fetchers.chrome\", \"DynamicSession\"),\n    \"AsyncDynamicSession\": (\"scrapling.fetchers.chrome\", \"AsyncDynamicSession\"),\n    \"StealthyFetcher\": (\"scrapling.fetchers.stealth_chrome\", \"StealthyFetcher\"),\n    \"StealthySession\": (\"scrapling.fetchers.stealth_chrome\", \"StealthySession\"),\n    \"AsyncStealthySession\": (\"scrapling.fetchers.stealth_chrome\", \"AsyncStealthySession\"),\n}\n\n__all__ = [\n    \"Fetcher\",\n    \"AsyncFetcher\",\n    \"ProxyRotator\",\n    \"FetcherSession\",\n    \"DynamicFetcher\",\n    \"DynamicSession\",\n    \"AsyncDynamicSession\",\n    \"StealthyFetcher\",\n    \"StealthySession\",\n    \"AsyncStealthySession\",\n]\n\n\ndef __getattr__(name: str) -> Any:\n    if name in _LAZY_IMPORTS:\n        module_path, class_name = _LAZY_IMPORTS[name]\n        module = __import__(module_path, fromlist=[class_name])\n        return getattr(module, class_name)\n    else:\n        raise AttributeError(f\"module {__name__!r} has no attribute {name!r}\")\n\n\ndef __dir__() -> list[str]:\n    \"\"\"Support for dir() and autocomplete.\"\"\"\n    return sorted(list(_LAZY_IMPORTS.keys()))\n"
  },
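A small sketch of the lazy-import behaviour above: attribute access on the package resolves names through `__getattr__`, so submodules are only imported on first use.

```python
# Sketch: fetcher classes are imported from their submodules only when first accessed.
import scrapling.fetchers as fetchers

print(dir(fetchers))             # the lazily importable names, sorted via __dir__
Fetcher = fetchers.Fetcher       # triggers __getattr__ -> imports scrapling.fetchers.requests

try:
    fetchers.DoesNotExist        # unknown attributes raise AttributeError as usual
except AttributeError as exc:
    print(exc)
```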
  {
    "path": "scrapling/fetchers/chrome.py",
    "content": "from scrapling.core._types import Unpack\nfrom scrapling.engines._browsers._types import PlaywrightSession\nfrom scrapling.engines.toolbelt.custom import BaseFetcher, Response\nfrom scrapling.engines._browsers._controllers import DynamicSession, AsyncDynamicSession\n\n\nclass DynamicFetcher(BaseFetcher):\n    \"\"\"A `Fetcher` that provide many options to fetch/load websites' pages through chromium-based browsers.\"\"\"\n\n    @classmethod\n    def fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:\n        \"\"\"Opens up a browser and do your request based on your chosen options below.\n\n        :param url: Target url.\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.\n        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.\n        :param locale: Set the locale for the browser if wanted. Defaults to the system default locale.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. 
The default state is `attached`.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request.\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        :param extra_flags: A list of additional browser flags to pass to the browser on launch.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.\n        :return: A `Response` object.\n        \"\"\"\n        selector_config = kwargs.get(\"selector_config\", {}) or kwargs.get(\n            \"custom_config\", {}\n        )  # Checking `custom_config` for backward compatibility\n        if not isinstance(selector_config, dict):\n            raise TypeError(\"Argument `selector_config` must be a dictionary.\")\n\n        kwargs[\"selector_config\"] = {**cls._generate_parser_arguments(), **selector_config}\n\n        with DynamicSession(**kwargs) as session:\n            return session.fetch(url)\n\n    @classmethod\n    async def async_fetch(cls, url: str, **kwargs: Unpack[PlaywrightSession]) -> Response:\n        \"\"\"Opens up a browser and do your request based on your chosen options below.\n\n        :param url: Target url.\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the Response object.\n        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param init_script: An absolute path to a JavaScript file to be executed on page creation with this request.\n        :param locale: Set the locale for the browser if wanted. Defaults to the system default locale.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. 
The default state is `attached`.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request.\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        :param extra_flags: A list of additional browser flags to pass to the browser on launch.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings.\n        :return: A `Response` object.\n        \"\"\"\n        selector_config = kwargs.get(\"selector_config\", {}) or kwargs.get(\n            \"custom_config\", {}\n        )  # Checking `custom_config` for backward compatibility\n        if not isinstance(selector_config, dict):\n            raise TypeError(\"Argument `selector_config` must be a dictionary.\")\n\n        kwargs[\"selector_config\"] = {**cls._generate_parser_arguments(), **selector_config}\n\n        async with AsyncDynamicSession(**kwargs) as session:\n            return await session.fetch(url)\n\n\nPlayWrightFetcher = DynamicFetcher  # For backward-compatibility\n"
  },
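A usage sketch for `DynamicFetcher` with a few of the options documented above. The target URL and blocked domain are placeholders, and the assumption that the returned `Response` exposes `status` and the selector-style `css` API is not guaranteed by this file alone:

```python
# Sketch: a single fetch through a Chromium-based browser with a few documented options.
from scrapling.fetchers import DynamicFetcher

page = DynamicFetcher.fetch(
    "https://example.com",                 # placeholder target
    headless=True,
    disable_resources=True,                # drop fonts/images/etc. for a speed boost
    blocked_domains={"ads.example.com"},   # subdomains are matched too
    network_idle=True,
    timeout=30_000,                        # milliseconds
    google_search=True,                    # send a Google referer header
)
print(page.status)                         # assumed Response attribute
print(page.css("title::text"))             # assumed Selector-style API on Response

# The async variant mirrors the same options:
# page = await DynamicFetcher.async_fetch("https://example.com", headless=True)
```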
  {
    "path": "scrapling/fetchers/requests.py",
    "content": "from scrapling.engines.static import (\n    FetcherSession,\n    FetcherClient as _FetcherClient,\n    AsyncFetcherClient as _AsyncFetcherClient,\n)\nfrom scrapling.engines.toolbelt.custom import BaseFetcher\n\n\n__FetcherClientInstance__ = _FetcherClient()\n__AsyncFetcherClientInstance__ = _AsyncFetcherClient()\n\n\nclass Fetcher(BaseFetcher):\n    \"\"\"A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`.\"\"\"\n\n    get = __FetcherClientInstance__.get\n    post = __FetcherClientInstance__.post\n    put = __FetcherClientInstance__.put\n    delete = __FetcherClientInstance__.delete\n\n\nclass AsyncFetcher(BaseFetcher):\n    \"\"\"A basic `Fetcher` class type that can only do basic GET, POST, PUT, and DELETE HTTP requests based on `curl_cffi`.\"\"\"\n\n    get = __AsyncFetcherClientInstance__.get\n    post = __AsyncFetcherClientInstance__.post\n    put = __AsyncFetcherClientInstance__.put\n    delete = __AsyncFetcherClientInstance__.delete\n"
  },
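A short sketch of the HTTP-only fetchers above; the URL is a placeholder, and the `status` attribute on the returned object is an assumption rather than something shown in this file:

```python
# Sketch: simple GET requests with the sync and async curl_cffi-based fetchers.
import asyncio
from scrapling.fetchers import Fetcher, AsyncFetcher

page = Fetcher.get("https://example.com")      # placeholder URL
print(page.status)                             # assumed attribute on the response

async def main():
    page = await AsyncFetcher.get("https://example.com")
    print(page.status)

asyncio.run(main())
```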
  {
    "path": "scrapling/fetchers/stealth_chrome.py",
    "content": "from scrapling.core._types import Unpack\nfrom scrapling.engines._browsers._types import StealthSession\nfrom scrapling.engines.toolbelt.custom import BaseFetcher, Response\nfrom scrapling.engines._browsers._stealth import StealthySession, AsyncStealthySession\n\n\nclass StealthyFetcher(BaseFetcher):\n    \"\"\"A `Fetcher` class type which is a completely stealthy built on top of Chromium.\n\n    It works as real browsers passing almost all online tests/protections with many customization options.\n    \"\"\"\n\n    @classmethod\n    def fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:\n        \"\"\"\n        Opens up a browser and do your request based on your chosen options below.\n\n        :param url: Target url.\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param page_action: Added for automation. A function that takes the `page` object and does the automation you need.\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.\n        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting\n            rules. Defaults to the system default locale.\n        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.\n        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.\n        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. 
Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.\n        :param extra_flags: A list of additional browser flags to pass to the browser on launch.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.\n        :return: A `Response` object.\n        \"\"\"\n        selector_config = kwargs.get(\"selector_config\", {}) or kwargs.get(\n            \"custom_config\", {}\n        )  # Checking `custom_config` for backward compatibility\n        if not isinstance(selector_config, dict):\n            raise TypeError(\"Argument `selector_config` must be a dictionary.\")\n\n        kwargs[\"selector_config\"] = {**cls._generate_parser_arguments(), **selector_config}\n\n        with StealthySession(**kwargs) as engine:\n            return engine.fetch(url)\n\n    @classmethod\n    async def async_fetch(cls, url: str, **kwargs: Unpack[StealthSession]) -> Response:\n        \"\"\"\n        Opens up a browser and do your request based on your chosen options below.\n\n        :param url: Target url.\n        :param headless: Run the browser in headless/hidden (default), or headful/visible mode.\n        :param disable_resources: Drop requests for unnecessary resources for a speed boost.\n            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.\n        :param blocked_domains: A set of domain names to block requests to. Subdomains are also matched (e.g., ``\"example.com\"`` blocks ``\"sub.example.com\"`` too).\n        :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.\n        :param cookies: Set cookies for the next request.\n        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.\n        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000\n        :param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.\n        :param page_action: Added for automation. 
A function that takes the `page` object and does the automation you need.\n        :param wait_selector: Wait for a specific CSS selector to be in a specific state.\n        :param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.\n        :param locale: Specify user locale, for example, `en-GB`, `de-DE`, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting\n            rules. Defaults to the system default locale.\n        :param timezone_id: Changes the timezone of the browser. Defaults to the system timezone.\n        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.\n        :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.\n        :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.\n        :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.\n        :param block_webrtc: Forces WebRTC to respect proxy settings to prevent local IP address leak.\n        :param allow_webgl: Enabled by default. Disabling it disables WebGL and WebGL 2.0 support entirely. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.\n        :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.\n        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.\n        :param google_search: Enabled by default, Scrapling will set a Google referer header.\n        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by `google_search` takes priority over the referer set here if used together._\n        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.\n        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.\n        :param extra_flags: A list of additional browser flags to pass to the browser on launch.\n        :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.\n        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.\n        :return: A `Response` object.\n        \"\"\"\n        selector_config = kwargs.get(\"selector_config\", {}) or kwargs.get(\n            \"custom_config\", {}\n        )  # Checking `custom_config` for backward compatibility\n        if not isinstance(selector_config, dict):\n            raise TypeError(\"Argument `selector_config` must be a dictionary.\")\n\n        kwargs[\"selector_config\"] = {**cls._generate_parser_arguments(), **selector_config}\n\n        async with AsyncStealthySession(**kwargs) as engine:\n            return await engine.fetch(url)\n"
  },
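A usage sketch for `StealthyFetcher` with a few of the stealth options documented above; the URL and proxy are placeholders, and `status` on the returned `Response` is assumed:

```python
# Sketch: one stealthy fetch that also tries to pass Cloudflare challenges.
from scrapling.fetchers import StealthyFetcher

page = StealthyFetcher.fetch(
    "https://example.com",                            # placeholder target
    headless=True,
    solve_cloudflare=True,                            # handle Turnstile/Interstitial challenges
    block_webrtc=True,                                # keep WebRTC behind the proxy
    hide_canvas=True,                                 # add noise to canvas fingerprinting
    proxy="http://user:pass@proxy.example.com:8080",  # placeholder proxy
)
print(page.status)                                    # assumed Response attribute

# The async variant mirrors the same options:
# page = await StealthyFetcher.async_fetch("https://example.com", solve_cloudflare=True)
```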
  {
    "path": "scrapling/parser.py",
    "content": "from pathlib import Path\nfrom inspect import signature\nfrom urllib.parse import urljoin\nfrom difflib import SequenceMatcher\nfrom re import Pattern as re_Pattern\n\nfrom lxml.html import HtmlElement, HTMLParser\nfrom cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors\nfrom lxml.etree import (\n    XPath,\n    tostring,\n    fromstring,\n    XPathError,\n    XPathEvalError,\n    _ElementUnicodeResult,\n)\n\nfrom scrapling.core._types import (\n    Any,\n    Set,\n    Dict,\n    cast,\n    List,\n    Tuple,\n    Union,\n    TypeVar,\n    Pattern,\n    Callable,\n    Literal,\n    Optional,\n    Iterable,\n    overload,\n    Generator,\n    SupportsIndex,\n    TYPE_CHECKING,\n)\nfrom scrapling.core.custom_types import AttributesHandler, TextHandler, TextHandlers\nfrom scrapling.core.mixins import SelectorsGeneration\nfrom scrapling.core.storage import (\n    SQLiteStorageSystem,\n    StorageSystemMixin,\n    _StorageTools,\n)\nfrom scrapling.core.translator import css_to_xpath as _css_to_xpath\nfrom scrapling.core.utils import clean_spaces, flatten, html_forbidden, log\n\n__DEFAULT_DB_FILE__ = str(Path(__file__).parent / \"elements_storage.db\")\n# Attributes that are Python reserved words and can't be used directly\n# Ex: find_all('a', class=\"blah\") -> find_all('a', class_=\"blah\")\n# https://www.w3schools.com/python/python_ref_keywords.asp\n_whitelisted = {\n    \"class_\": \"class\",\n    \"for_\": \"for\",\n}\n_T = TypeVar(\"_T\")\n# Pre-compiled selectors for efficiency\n_find_all_elements = XPath(\".//*\")\n_find_all_elements_with_spaces = XPath(\n    \".//*[normalize-space(text())]\"\n)  # This selector gets all elements with text content\n_find_all_text_nodes = XPath(\".//text()\")\n\n\nclass Selector(SelectorsGeneration):\n    __slots__ = (\n        \"url\",\n        \"encoding\",\n        \"__adaptive_enabled\",\n        \"_root\",\n        \"_storage\",\n        \"__keep_comments\",\n        \"__huge_tree_enabled\",\n        \"__attributes\",\n        \"__text\",\n        \"__tag\",\n        \"__keep_cdata\",\n        \"_raw_body\",\n    )\n\n    def __init__(\n        self,\n        content: Optional[str | bytes] = None,\n        url: str = \"\",\n        encoding: str = \"utf-8\",\n        huge_tree: bool = True,\n        root: Optional[HtmlElement] = None,\n        keep_comments: Optional[bool] = False,\n        keep_cdata: Optional[bool] = False,\n        adaptive: Optional[bool] = False,\n        _storage: Optional[StorageSystemMixin] = None,\n        storage: Any = SQLiteStorageSystem,\n        storage_args: Optional[Dict] = None,\n        **_,\n    ):\n        \"\"\"The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements\n        with expressions in CSS, XPath, or with simply text. Check the docs for more info.\n\n        Here we try to extend module ``lxml.html.HtmlElement`` while maintaining a simpler interface, We are not\n        inheriting from the ``lxml.html.HtmlElement`` because it's not pickleable, which makes a lot of reference jobs\n        not possible. 
You can test it here and see code explodes with `AssertionError: invalid Element proxy at...`.\n        It's an old issue with lxml, see `this entry <https://bugs.launchpad.net/lxml/+bug/736708>`\n\n        :param content: HTML content as either string or bytes.\n        :param url: It allows storing a URL with the HTML data for retrieving later.\n        :param encoding: The encoding type that will be used in HTML parsing, default is `UTF-8`\n        :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls\n             the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.\n        :param root: Used internally to pass etree objects instead of text/body arguments, it takes the highest priority.\n            Don't use it unless you know what you are doing!\n        :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons\n        :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.\n        :param adaptive: Globally turn off the adaptive feature in all functions, this argument takes higher\n            priority over all adaptive related arguments/functions in the class.\n        :param storage: The storage class to be passed for adaptive functionalities, see ``Docs`` for more info.\n        :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.\n            If empty, default values will be used.\n        \"\"\"\n        if root is None and content is None:\n            raise ValueError(\"Selector class needs HTML content, or root arguments to work\")\n\n        self.url = url\n        self._raw_body: str | bytes = \"\"\n        self.encoding = encoding\n        self.__keep_cdata = keep_cdata\n        self.__huge_tree_enabled = huge_tree\n        self.__keep_comments = keep_comments\n        # For selector stuff\n        self.__text: Optional[TextHandler] = None\n        self.__attributes: Optional[AttributesHandler] = None\n        self.__tag: Optional[str] = None\n        self._storage: Optional[StorageSystemMixin] = None\n        if root is None:\n            body: str | bytes\n            if isinstance(content, str):\n                body = content.strip().replace(\"\\x00\", \"\") or \"<html/>\"\n            elif isinstance(content, bytes):\n                body = content.replace(b\"\\x00\", b\"\")\n            else:\n                raise TypeError(f\"content argument must be str or bytes, got {type(content)}\")\n\n            # https://lxml.de/api/lxml.etree.HTMLParser-class.html\n            _parser_kwargs: Dict[str, Any] = dict(\n                recover=True,\n                remove_blank_text=True,\n                remove_comments=(not keep_comments),\n                encoding=encoding,\n                compact=True,\n                huge_tree=huge_tree,\n                default_doctype=True,  # Supported by lxml but missing from stubs\n                strip_cdata=(not keep_cdata),\n            )\n            parser = HTMLParser(**_parser_kwargs)\n            self._root = cast(HtmlElement, fromstring(body or \"<html/>\", parser=parser, base_url=url or \"\"))\n            self._raw_body = content\n\n        else:\n            self._root = cast(HtmlElement, root)\n\n            if self._is_text_node(root):\n                self.__adaptive_enabled = False\n                return\n\n        self.__adaptive_enabled = 
bool(adaptive)\n\n        if self.__adaptive_enabled:\n            if _storage is not None:\n                self._storage = _storage\n            else:\n                if not storage_args:\n                    storage_args = {\n                        \"storage_file\": __DEFAULT_DB_FILE__,\n                        \"url\": url,\n                    }\n\n                if not hasattr(storage, \"__wrapped__\"):\n                    raise ValueError(\"Storage class must be wrapped with lru_cache decorator, see docs for info\")\n\n                if not issubclass(storage.__wrapped__, StorageSystemMixin):  # pragma: no cover\n                    raise ValueError(\"Storage system must be inherited from class `StorageSystemMixin`\")\n\n                self._storage = storage(**storage_args)\n\n    def __getitem__(self, key: str) -> TextHandler:\n        if self._is_text_node(self._root):\n            raise TypeError(\"Text nodes do not have attributes\")\n        return self.attrib[key]\n\n    def __contains__(self, key: str) -> bool:\n        if self._is_text_node(self._root):\n            return False\n        return key in self.attrib\n\n    # Node functionalities, I wanted to move to a separate Mixin class, but it had a slight impact on performance\n    @staticmethod\n    def _is_text_node(\n        element: HtmlElement | _ElementUnicodeResult,\n    ) -> bool:\n        \"\"\"Return True if the given element is a result of a string expression\n        Examples:\n            XPath -> '/text()', '/@attribute', etc...\n            CSS3 -> '::text', '::attr(attrib)'...\n        \"\"\"\n        # Faster than checking `element.is_attribute or element.is_text or element.is_tail`\n        return issubclass(type(element), _ElementUnicodeResult)\n\n    def __element_convertor(self, element: HtmlElement | _ElementUnicodeResult) -> \"Selector\":\n        \"\"\"Used internally to convert a single HtmlElement or text node to Selector directly without checks\"\"\"\n        return Selector(\n            root=element,\n            url=self.url,\n            encoding=self.encoding,\n            adaptive=self.__adaptive_enabled,\n            _storage=self._storage,\n            keep_comments=self.__keep_comments,\n            keep_cdata=self.__keep_cdata,\n            huge_tree=self.__huge_tree_enabled,\n        )\n\n    def __elements_convertor(self, elements: List[HtmlElement | _ElementUnicodeResult]) -> \"Selectors\":\n        # Store them for non-repeated call-ups\n        url = self.url\n        encoding = self.encoding\n        adaptive = self.__adaptive_enabled\n        storage = self._storage\n        comments = self.__keep_comments\n        cdata = self.__keep_cdata\n        huge_tree = self.__huge_tree_enabled\n\n        return Selectors(\n            Selector(\n                root=el,\n                url=url,\n                encoding=encoding,\n                adaptive=adaptive,\n                _storage=storage,\n                keep_comments=comments,\n                keep_cdata=cdata,\n                huge_tree=huge_tree,\n            )\n            for el in elements\n        )\n\n    def __handle_elements(self, result: List[HtmlElement | _ElementUnicodeResult]) -> \"Selectors\":\n        \"\"\"Used internally in all functions to convert results to Selectors in bulk\"\"\"\n        if not result:\n            return Selectors()\n\n        return self.__elements_convertor(result)\n\n    def __getstate__(self) -> Any:\n        # lxml don't like it :)\n        raise TypeError(\"Can't pickle 
Selector objects\")\n\n    # The following four properties I made them into functions instead of variables directly\n    # So they don't slow down the process of initializing many instances of the class and gets executed only\n    # when the user needs them for the first time for that specific element and gets cached for next times\n    # Doing that only made the library performance test sky rocked multiple times faster than before\n    # because I was executing them on initialization before :))\n    @property\n    def tag(self) -> str:\n        \"\"\"Get the tag name of the element\"\"\"\n        if self._is_text_node(self._root):\n            return \"#text\"\n        if not self.__tag:\n            self.__tag = str(self._root.tag)\n        return self.__tag or \"\"\n\n    @property\n    def text(self) -> TextHandler:\n        \"\"\"Get text content of the element\"\"\"\n        if self._is_text_node(self._root):\n            return TextHandler(str(self._root))\n        if self.__text is None:\n            # If you want to escape lxml default behavior and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`\n            # before extracting text, then keep `keep_comments` set to False while initializing the first class\n            self.__text = TextHandler(self._root.text or \"\")\n        return self.__text\n\n    def get_all_text(\n        self,\n        separator: str = \"\\n\",\n        strip: bool = False,\n        ignore_tags: Tuple = (\n            \"script\",\n            \"style\",\n        ),\n        valid_values: bool = True,\n    ) -> TextHandler:\n        \"\"\"Get all child strings of this element, concatenated using the given separator.\n\n        :param separator: Strings will be concatenated using this separator.\n        :param strip: If True, strings will be stripped before being concatenated.\n        :param ignore_tags: A tuple of all tag names you want to ignore\n        :param valid_values: If enabled, elements with text-content that is empty or only whitespaces will be ignored\n\n        :return: A TextHandler\n        \"\"\"\n        if self._is_text_node(self._root):\n            return TextHandler(str(self._root))\n\n        ignored_elements: set[Any] = set()\n        if ignore_tags:\n            ignored_elements.update(self._root.iter(*ignore_tags))\n\n        _all_strings = []\n\n        def append_text(text: str) -> None:\n            processed_text = text.strip() if strip else text\n            if not valid_values or processed_text.strip():\n                _all_strings.append(processed_text)\n\n        def is_visible_text_node(text_node: _ElementUnicodeResult) -> bool:\n            parent = text_node.getparent()\n            if parent is None:\n                return False\n\n            owner = parent.getparent() if text_node.is_tail else parent\n            while owner is not None:\n                if owner in ignored_elements:\n                    return False\n                owner = owner.getparent()\n            return True\n\n        for text_node in cast(list[_ElementUnicodeResult], _find_all_text_nodes(self._root)):\n            text = str(text_node)\n            if text and is_visible_text_node(text_node):\n                append_text(text)\n\n        return cast(TextHandler, TextHandler(separator).join(_all_strings))\n\n    def urljoin(self, relative_url: str) -> str:\n        \"\"\"Join this Selector's url with a relative url to form an absolute full URL.\"\"\"\n        return urljoin(self.url, relative_url)\n\n    @property\n 
   def attrib(self) -> AttributesHandler:\n        \"\"\"Get attributes of the element\"\"\"\n        if self._is_text_node(self._root):\n            return AttributesHandler({})\n        if not self.__attributes:\n            self.__attributes = AttributesHandler(self._root.attrib)\n        return self.__attributes\n\n    @property\n    def html_content(self) -> TextHandler:\n        \"\"\"Return the inner HTML code of the element\"\"\"\n        if self._is_text_node(self._root):\n            return TextHandler(str(self._root))\n        content = tostring(self._root, encoding=self.encoding, method=\"html\", with_tail=False)\n        if isinstance(content, bytes):\n            content = content.strip().decode(self.encoding)\n        return TextHandler(content)\n\n    @property\n    def body(self) -> str | bytes:\n        \"\"\"Return the raw body of the current `Selector` without any processing. Useful for binary and non-HTML requests.\"\"\"\n        if self._is_text_node(self._root):\n            return \"\"\n        return self._raw_body\n\n    def prettify(self) -> TextHandler:\n        \"\"\"Return a prettified version of the element's inner html-code\"\"\"\n        if self._is_text_node(self._root):\n            return TextHandler(str(self._root))\n        content = tostring(\n            self._root,\n            encoding=self.encoding,\n            pretty_print=True,\n            method=\"html\",\n            with_tail=False,\n        )\n        if isinstance(content, bytes):\n            content = content.strip().decode(self.encoding)\n        return TextHandler(content)\n\n    def has_class(self, class_name: str) -> bool:\n        \"\"\"Check if the element has a specific class\n        :param class_name: The class name to check for\n        :return: True if element has class with that name otherwise False\n        \"\"\"\n        if self._is_text_node(self._root):\n            return False\n        return class_name in self._root.classes\n\n    @property\n    def parent(self) -> Optional[\"Selector\"]:\n        \"\"\"Return the direct parent of the element or ``None`` otherwise\"\"\"\n        _parent = self._root.getparent()\n        return self.__element_convertor(_parent) if _parent is not None else None\n\n    @property\n    def below_elements(self) -> \"Selectors\":\n        \"\"\"Return all elements under the current element in the DOM tree\"\"\"\n        if self._is_text_node(self._root):\n            return Selectors()\n        below = cast(List, _find_all_elements(self._root))\n        return self.__elements_convertor(below) if below is not None else Selectors()\n\n    @property\n    def children(self) -> \"Selectors\":\n        \"\"\"Return the children elements of the current element or empty list otherwise\"\"\"\n        if self._is_text_node(self._root):\n            return Selectors()\n        return Selectors(\n            self.__element_convertor(child)\n            for child in self._root.iterchildren()\n            if not isinstance(child, html_forbidden)\n        )\n\n    @property\n    def siblings(self) -> \"Selectors\":\n        \"\"\"Return other children of the current element's parent or empty list otherwise\"\"\"\n        if self.parent:\n            return Selectors(child for child in self.parent.children if child._root != self._root)\n        return Selectors()\n\n    def iterancestors(self) -> Generator[\"Selector\", None, None]:\n        \"\"\"Return a generator that loops over all ancestors of the element, starting with the element's parent.\"\"\"\n   
     if self._is_text_node(self._root):\n            return\n        for ancestor in self._root.iterancestors():\n            yield self.__element_convertor(ancestor)\n\n    def find_ancestor(self, func: Callable[[\"Selector\"], bool]) -> Optional[\"Selector\"]:\n        \"\"\"Loop over all ancestors of the element till one match the passed function\n        :param func: A function that takes each ancestor as an argument and returns True/False\n        :return: The first ancestor that match the function or ``None`` otherwise.\n        \"\"\"\n        for ancestor in self.iterancestors():\n            if func(ancestor):\n                return ancestor\n        return None\n\n    @property\n    def path(self) -> \"Selectors\":\n        \"\"\"Returns a list of type `Selectors` that contains the path leading to the current element from the root.\"\"\"\n        lst = list(self.iterancestors())\n        return Selectors(lst)\n\n    @property\n    def next(self) -> Optional[\"Selector\"]:\n        \"\"\"Returns the next element of the current element in the children of the parent or ``None`` otherwise.\"\"\"\n        if self._is_text_node(self._root):\n            return None\n        next_element = self._root.getnext()\n        while next_element is not None and isinstance(next_element, html_forbidden):\n            # Ignore HTML comments and unwanted types\n            next_element = next_element.getnext()\n\n        return self.__element_convertor(next_element) if next_element is not None else None\n\n    @property\n    def previous(self) -> Optional[\"Selector\"]:\n        \"\"\"Returns the previous element of the current element in the children of the parent or ``None`` otherwise.\"\"\"\n        if self._is_text_node(self._root):\n            return None\n        prev_element = self._root.getprevious()\n        while prev_element is not None and isinstance(prev_element, html_forbidden):\n            # Ignore HTML comments and unwanted types\n            prev_element = prev_element.getprevious()\n\n        return self.__element_convertor(prev_element) if prev_element is not None else None\n\n    def get(self) -> TextHandler:\n        \"\"\"\n        Serialize this element to a string.\n        For text nodes, returns the text value. 
For HTML elements, returns the outer HTML.\n        \"\"\"\n        if self._is_text_node(self._root):\n            return TextHandler(str(self._root))\n        return self.html_content\n\n    def getall(self) -> TextHandlers:\n        \"\"\"Return a single-element list containing this element's serialized string.\"\"\"\n        return TextHandlers([self.get()])\n\n    extract = getall\n    extract_first = get\n\n    def __str__(self) -> str:\n        if self._is_text_node(self._root):\n            return str(self._root)\n        return self.html_content\n\n    def __repr__(self) -> str:\n        length_limit = 40\n\n        if self._is_text_node(self._root):\n            text = str(self._root)\n            if len(text) > length_limit:\n                text = text[:length_limit].strip() + \"...\"\n            return f\"<text='{text}'>\"\n\n        content = clean_spaces(self.html_content)\n        if len(content) > length_limit:\n            content = content[:length_limit].strip() + \"...\"\n        data = f\"<data='{content}'\"\n\n        if self.parent:\n            parent_content = clean_spaces(self.parent.html_content)\n            if len(parent_content) > length_limit:\n                parent_content = parent_content[:length_limit].strip() + \"...\"\n\n            data += f\" parent='{parent_content}'\"\n\n        return data + \">\"\n\n    # From here we start with the selecting functions\n    @overload\n    def relocate(\n        self, element: Union[Dict, HtmlElement, \"Selector\"], percentage: int, selector_type: Literal[True]\n    ) -> \"Selectors\": ...\n\n    @overload\n    def relocate(\n        self, element: Union[Dict, HtmlElement, \"Selector\"], percentage: int, selector_type: Literal[False] = False\n    ) -> List[HtmlElement]: ...\n\n    def relocate(\n        self,\n        element: Union[Dict, HtmlElement, \"Selector\"],\n        percentage: int = 0,\n        selector_type: bool = False,\n    ) -> Union[List[HtmlElement], \"Selectors\"]:\n        \"\"\"This function will search again for the element in the page tree, used automatically on page structure change\n\n        :param element: The element we want to relocate in the tree\n        :param percentage: The minimum percentage to accept and not going lower than that. 
Be aware that the percentage\n         calculation depends solely on the page structure, so don't play with this number unless you must know\n         what you are doing!\n        :param selector_type: If True, the return result will be converted to `Selectors` object\n        :return: List of pure HTML elements that got the highest matching score or 'Selectors' object\n        \"\"\"\n        score_table: Dict[float, List[Any]] = {}\n        # Note: `element` will most likely always be a dictionary at this point.\n        if isinstance(element, self.__class__):\n            element = element._root\n\n        if issubclass(type(element), HtmlElement):\n            element = _StorageTools.element_to_dict(element)\n\n        for node in cast(List, _find_all_elements(self._root)):\n            # Collect all elements in the page, then for each element get the matching score of it against the node.\n            # Hence: the code doesn't stop even if the score was 100%\n            # because there might be another element(s) left in page with the same score\n            score = self.__calculate_similarity_score(cast(Dict, element), node)\n            score_table.setdefault(score, []).append(node)\n\n        if score_table:\n            highest_probability = max(score_table.keys())\n            if score_table[highest_probability] and highest_probability >= percentage:\n                if log.getEffectiveLevel() < 20:\n                    # No need to execute this part if the logging level is not debugging\n                    log.debug(f\"Highest probability was {highest_probability}%\")\n                    log.debug(\"Top 5 best matching elements are: \")\n                    for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:\n                        log.debug(f\"{percent} -> {self.__elements_convertor(score_table[percent])}\")\n\n                if not selector_type:\n                    return score_table[highest_probability]\n                return self.__elements_convertor(score_table[highest_probability])\n        return []\n\n    def css(\n        self,\n        selector: str,\n        identifier: str = \"\",\n        adaptive: bool = False,\n        auto_save: bool = False,\n        percentage: int = 0,\n    ) -> \"Selectors\":\n        \"\"\"Search the current tree with CSS3 selectors\n\n        **Important:\n        It's recommended to use the identifier argument if you plan to use a different selector later\n        and want to relocate the same element(s)**\n\n        :param selector: The CSS3 selector to be used.\n        :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before\n        :param identifier: A string that will be used to save/retrieve element's data in adaptive,\n         otherwise the selector will be used.\n        :param auto_save: Automatically save new elements for `adaptive` later\n        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.\n         Be aware that the percentage calculation depends solely on the page structure, so don't play with this\n         number unless you must know what you are doing!\n\n        :return: `Selectors` class.\n        \"\"\"\n        if self._is_text_node(self._root):\n            return Selectors()\n\n        try:\n            if not self.__adaptive_enabled or \",\" not in selector:\n                # No need to split selectors in this case, let's save some CPU cycles :)\n                xpath_selector = 
_css_to_xpath(selector)\n                return self.xpath(\n                    xpath_selector,\n                    identifier or selector,\n                    adaptive,\n                    auto_save,\n                    percentage,\n                )\n\n            results = Selectors()\n            for single_selector in split_selectors(selector):\n                # I'm doing this only so the `save` function saves data correctly for combined selectors\n                # Like using the ',' to combine two different selectors that point to different elements.\n                xpath_selector = _css_to_xpath(single_selector.canonical())\n                results += self.xpath(\n                    xpath_selector,\n                    identifier or single_selector.canonical(),\n                    adaptive,\n                    auto_save,\n                    percentage,\n                )\n\n            return Selectors(results)\n        except (\n            SelectorError,\n            SelectorSyntaxError,\n        ) as e:\n            raise SelectorSyntaxError(f\"Invalid CSS selector '{selector}': {str(e)}\") from e\n\n    def xpath(\n        self,\n        selector: str,\n        identifier: str = \"\",\n        adaptive: bool = False,\n        auto_save: bool = False,\n        percentage: int = 0,\n        **kwargs: Any,\n    ) -> \"Selectors\":\n        \"\"\"Search the current tree with XPath selectors\n\n        **Important:\n        It's recommended to use the identifier argument if you plan to use a different selector later\n        and want to relocate the same element(s)**\n\n         Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**\n\n        :param selector: The XPath selector to be used.\n        :param adaptive: Enabled will make the function try to relocate the element if it was 'saved' before\n        :param identifier: A string that will be used to save/retrieve element's data in adaptive,\n         otherwise the selector will be used.\n        :param auto_save: Automatically save new elements for `adaptive` later\n        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.\n         Be aware that the percentage calculation depends solely on the page structure, so don't play with this\n         number unless you must know what you are doing!\n\n        :return: `Selectors` class.\n        \"\"\"\n        if self._is_text_node(self._root):\n            return Selectors()\n\n        try:\n            if elements := self._root.xpath(selector, **kwargs):\n                if not self.__adaptive_enabled and auto_save:\n                    log.warning(\n                        \"Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. 
Check docs for more info.\"\n                    )\n                elif self.__adaptive_enabled and auto_save:\n                    self.save(elements[0], identifier or selector)\n\n                return self.__handle_elements(elements)\n            elif self.__adaptive_enabled:\n                if adaptive:\n                    element_data = self.retrieve(identifier or selector)\n                    if element_data:\n                        elements = self.relocate(element_data, percentage)\n                        if elements is not None and auto_save:\n                            self.save(elements[0], identifier or selector)\n\n                return self.__handle_elements(elements)\n            else:\n                if adaptive:\n                    log.warning(\n                        \"Argument `adaptive` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info.\"\n                    )\n                elif auto_save:\n                    log.warning(\n                        \"Argument `auto_save` will be ignored because `adaptive` wasn't enabled on initialization. Check docs for more info.\"\n                    )\n\n                return self.__handle_elements(elements)\n\n        except (\n            SelectorError,\n            SelectorSyntaxError,\n            XPathError,\n            XPathEvalError,\n        ) as e:\n            raise SelectorSyntaxError(f\"Invalid XPath selector: {selector}\") from e\n\n    def find_all(\n        self,\n        *args: str | Iterable[str] | Pattern | Callable | Dict[str, str],\n        **kwargs: str,\n    ) -> \"Selectors\":\n        \"\"\"Find elements by filters of your creations for ease.\n\n        :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. 
Leave empty for selecting all.\n        :param kwargs: The attributes you want to filter elements based on it.\n        :return: The `Selectors` object of the elements or empty list\n        \"\"\"\n        if self._is_text_node(self._root):\n            return Selectors()\n\n        if not args and not kwargs:\n            raise TypeError(\"You have to pass something to search with, like tag name(s), tag attributes, or both.\")\n\n        attributes: Dict[str, Any] = dict()\n        tags: Set[str] = set()\n        patterns: Set[Pattern] = set()\n        results, functions, selectors = Selectors(), [], []\n\n        # Brace yourself for a wonderful journey!\n        for arg in args:\n            if isinstance(arg, str):\n                tags.add(arg)\n\n            elif type(arg) in (list, tuple, set):\n                arg = cast(Iterable, arg)  # Type narrowing for type checkers like pyright\n                if not all(map(lambda x: isinstance(x, str), arg)):\n                    raise TypeError(\"Nested Iterables are not accepted, only iterables of tag names are accepted\")\n                tags.update(set(arg))\n\n            elif isinstance(arg, dict):\n                if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in arg.items()]):\n                    raise TypeError(\n                        \"Nested dictionaries are not accepted, only string keys and string values are accepted\"\n                    )\n                attributes.update(arg)\n\n            elif isinstance(arg, re_Pattern):\n                patterns.add(arg)\n\n            elif callable(arg):\n                if len(signature(arg).parameters) > 0:\n                    functions.append(arg)\n                else:\n                    raise TypeError(\n                        \"Callable filter function must have at least one argument to take `Selector` objects.\"\n                    )\n\n            else:\n                raise TypeError(f'Argument with type \"{type(arg)}\" is not accepted, please read the docs.')\n\n        if not all([(isinstance(k, str) and isinstance(v, str)) for k, v in kwargs.items()]):\n            raise TypeError(\"Only string values are accepted for arguments\")\n\n        for attribute_name, value in kwargs.items():\n            # Only replace names for kwargs, replacing them in dictionaries doesn't make sense\n            attribute_name = _whitelisted.get(attribute_name, attribute_name)\n            attributes[attribute_name] = value\n\n        # It's easier and faster to build a selector than traversing the tree\n        tags = tags or set(\"*\")\n        for tag in tags:\n            selector = tag\n            for key, value in attributes.items():\n                value = value.replace('\"', r\"\\\"\")  # Escape double quotes in user input\n                # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)\n                selector += '[{}=\"{}\"]'.format(key, value)\n            if selector != \"*\":\n                selectors.append(selector)\n\n        if selectors:\n            results = cast(Selectors, self.css(\", \".join(selectors)))\n            if results:\n                # From the results, get the ones that fulfill passed regex patterns\n                for pattern in patterns:\n                    results = results.filter(lambda e: e.text.re(pattern, check_match=True))\n\n                # From the results, get the ones that fulfill passed functions\n                for function in functions:\n          
          results = results.filter(function)\n        else:\n            results = results or self.below_elements\n            for pattern in patterns:\n                results = results.filter(lambda e: e.text.re(pattern, check_match=True))\n\n            # Collect an element if it fulfills the passed function otherwise\n            for function in functions:\n                results = results.filter(function)\n\n        return results\n\n    def find(\n        self,\n        *args: str | Iterable[str] | Pattern | Callable | Dict[str, str],\n        **kwargs: str,\n    ) -> Optional[\"Selector\"]:\n        \"\"\"Find elements by filters of your creations for ease, then return the first result. Otherwise return `None`.\n\n        :param args: Tag name(s), iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.\n        :param kwargs: The attributes you want to filter elements based on it.\n        :return: The `Selector` object of the element or `None` if the result didn't match\n        \"\"\"\n        for element in self.find_all(*args, **kwargs):\n            return element\n        return None\n\n    def __calculate_similarity_score(self, original: Dict, candidate: HtmlElement) -> float:\n        \"\"\"Used internally to calculate a score that shows how a candidate element similar to the original one\n\n        :param original: The original element in the form of the dictionary generated from `element_to_dict` function\n        :param candidate: The element to compare with the original element.\n        :return: A percentage score of how similar is the candidate to the original element\n        \"\"\"\n        score: float = 0\n        checks: int = 0\n        data = _StorageTools.element_to_dict(candidate)\n\n        score += 1 if original[\"tag\"] == data[\"tag\"] else 0\n        checks += 1\n\n        if original[\"text\"]:\n            score += SequenceMatcher(None, original[\"text\"], data.get(\"text\") or \"\").ratio()\n            checks += 1\n\n        # if both don't have attributes, it still counts for something!\n        score += self.__calculate_dict_diff(original[\"attributes\"], data[\"attributes\"])\n        checks += 1\n\n        # Separate similarity test for class, id, href,... 
this will help in full structural changes\n        for attrib in (\n            \"class\",\n            \"id\",\n            \"href\",\n            \"src\",\n        ):\n            if original[\"attributes\"].get(attrib):\n                score += SequenceMatcher(\n                    None,\n                    original[\"attributes\"][attrib],\n                    data[\"attributes\"].get(attrib) or \"\",\n                ).ratio()\n                checks += 1\n\n        score += SequenceMatcher(None, original[\"path\"], data[\"path\"]).ratio()\n        checks += 1\n\n        if original.get(\"parent_name\"):\n            # Then we start comparing parents' data\n            if data.get(\"parent_name\"):\n                score += SequenceMatcher(None, original[\"parent_name\"], data.get(\"parent_name\") or \"\").ratio()\n                checks += 1\n\n                score += self.__calculate_dict_diff(original[\"parent_attribs\"], data.get(\"parent_attribs\") or {})\n                checks += 1\n\n                if original[\"parent_text\"]:\n                    score += SequenceMatcher(\n                        None,\n                        original[\"parent_text\"],\n                        data.get(\"parent_text\") or \"\",\n                    ).ratio()\n                    checks += 1\n            # else:\n            #     # The original element has a parent and this one not, this is not a good sign\n            #     score -= 0.1\n\n        if original.get(\"siblings\"):\n            score += SequenceMatcher(None, original[\"siblings\"], data.get(\"siblings\") or []).ratio()\n            checks += 1\n\n        # How % sure? let's see\n        return round((score / checks) * 100, 2)\n\n    @staticmethod\n    def __calculate_dict_diff(dict1: Dict, dict2: Dict) -> float:\n        \"\"\"Used internally to calculate similarity between two dictionaries as SequenceMatcher doesn't accept dictionaries\"\"\"\n        score = SequenceMatcher(None, tuple(dict1.keys()), tuple(dict2.keys())).ratio() * 0.5\n        score += SequenceMatcher(None, tuple(dict1.values()), tuple(dict2.values())).ratio() * 0.5\n        return score\n\n    def save(self, element: HtmlElement, identifier: str) -> None:\n        \"\"\"Saves the element's unique properties to the storage for retrieval and relocation later\n\n        :param element: The element itself that we want to save to storage, it can be a ` Selector ` or pure ` HtmlElement `\n        :param identifier: This is the identifier that will be used to retrieve the element later from the storage. See\n            the docs for more info.\n        \"\"\"\n        if self.__adaptive_enabled and self._storage:\n            target_element: Any = element\n            if isinstance(target_element, self.__class__):\n                target_element = target_element._root\n\n            if self._is_text_node(target_element):\n                target_element = target_element.getparent()\n\n            self._storage.save(target_element, identifier)\n        else:\n            raise RuntimeError(\n                \"Can't use `adaptive` features while it's disabled globally, you have to start a new class instance.\"\n            )\n\n    def retrieve(self, identifier: str) -> Optional[Dict[str, Any]]:\n        \"\"\"Using the identifier, we search the storage and return the unique properties of the element\n\n        :param identifier: This is the identifier that will be used to retrieve the element from the storage. 
See\n            the docs for more info.\n        :return: A dictionary of the unique properties\n        \"\"\"\n        if self.__adaptive_enabled and self._storage:\n            return self._storage.retrieve(identifier)\n\n        raise RuntimeError(\n            \"Can't use `adaptive` features while it's disabled globally, you have to start a new class instance.\"\n        )\n\n    # Operations on text functions\n    def json(self) -> Dict:\n        \"\"\"Return JSON response if the response is jsonable otherwise throws error\"\"\"\n        if self._is_text_node(self._root):\n            return TextHandler(str(self._root)).json()\n        if self._raw_body and isinstance(self._raw_body, (str, bytes)):\n            if isinstance(self._raw_body, str):\n                return TextHandler(self._raw_body).json()\n            else:\n                if TYPE_CHECKING:\n                    assert isinstance(self._raw_body, bytes)\n                return TextHandler(self._raw_body.decode()).json()\n        elif self.text:\n            return self.text.json()\n        else:\n            return self.get_all_text(strip=True).json()\n\n    def re(\n        self,\n        regex: str | Pattern[str],\n        replace_entities: bool = True,\n        clean_match: bool = False,\n        case_sensitive: bool = True,\n    ) -> TextHandlers:\n        \"\"\"Apply the given regex to the current text and return a list of strings with the matches.\n\n        :param regex: Can be either a compiled regular expression or a string.\n        :param replace_entities: If enabled character entity references are replaced by their corresponding character\n        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching\n        :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it\n        \"\"\"\n        return self.text.re(regex, replace_entities, clean_match, case_sensitive)\n\n    def re_first(\n        self,\n        regex: str | Pattern[str],\n        default=None,\n        replace_entities: bool = True,\n        clean_match: bool = False,\n        case_sensitive: bool = True,\n    ) -> TextHandler:\n        \"\"\"Apply the given regex to text and return the first match if found, otherwise return the default value.\n\n        :param regex: Can be either a compiled regular expression or a string.\n        :param default: The default value to be returned if there is no match\n        :param replace_entities: if enabled character entity references are replaced by their corresponding character\n        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching\n        :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it\n        \"\"\"\n        return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)\n\n    @staticmethod\n    def __get_attributes(element: HtmlElement, ignore_attributes: List | Tuple) -> Dict:\n        \"\"\"Return attributes dictionary without the ignored list\"\"\"\n        return {k: v for k, v in element.attrib.items() if k not in ignore_attributes}\n\n    def __are_alike(\n        self,\n        original: HtmlElement,\n        original_attributes: Dict,\n        candidate: HtmlElement,\n        ignore_attributes: List | Tuple,\n        similarity_threshold: float,\n        match_text: bool = False,\n    ) -> bool:\n        \"\"\"Calculate a score of how much 
these elements are alike and return True\n        if the score is higher or equals the threshold\"\"\"\n        candidate_attributes = (\n            self.__get_attributes(candidate, ignore_attributes) if ignore_attributes else candidate.attrib\n        )\n        score: float = 0\n        checks: int = 0\n\n        if original_attributes:\n            score += sum(\n                SequenceMatcher(None, v, candidate_attributes.get(k, \"\")).ratio()\n                for k, v in original_attributes.items()\n            )\n            checks += len(candidate_attributes)\n        else:\n            if not candidate_attributes:\n                # Both don't have attributes, this must mean something\n                score += 1\n                checks += 1\n\n        if match_text:\n            score += SequenceMatcher(\n                None,\n                clean_spaces(original.text or \"\"),\n                clean_spaces(candidate.text or \"\"),\n            ).ratio()\n            checks += 1\n\n        if checks:\n            return round(score / checks, 2) >= similarity_threshold\n        return False\n\n    def find_similar(\n        self,\n        similarity_threshold: float = 0.2,\n        ignore_attributes: List | Tuple = (\n            \"href\",\n            \"src\",\n        ),\n        match_text: bool = False,\n    ) -> \"Selectors\":\n        \"\"\"Find elements that are in the same tree depth in the page with the same tag name and same parent tag etc...\n        then return the ones that match the current element attributes with a percentage higher than the input threshold.\n\n        This function is inspired by AutoScraper and made for cases where you, for example, found a product div inside\n        a products-list container and want to find other products using that element as a starting point EXCEPT\n        this function works in any case without depending on the element type.\n\n        :param similarity_threshold: The percentage to use while comparing element attributes.\n            Note: Elements found before attributes matching/comparison will be sharing the same depth, same tag name,\n            same parent tag name, and same grand parent tag name. 
So they are 99% likely to be correct unless you are\n            extremely unlucky, then attributes matching comes into play, so don't play with this number unless\n            you are getting the results you don't want.\n            Also, if the current element doesn't have attributes and the similar element as well, then it's a 100% match.\n        :param ignore_attributes: Attribute names passed will be ignored while matching the attributes in the last step.\n            The default value is to ignore `href` and `src` as URLs can change a lot between elements, so it's unreliable\n        :param match_text: If True, element text content will be taken into calculation while matching.\n            Not recommended to use in normal cases, but it depends.\n\n        :return: A ``Selectors`` container of ``Selector`` objects or empty list\n        \"\"\"\n        if self._is_text_node(self._root):\n            return Selectors()\n\n        # We will use the elements' root from now on to get the speed boost of using Lxml directly\n        root = self._root\n        similar_elements = list()\n\n        current_depth = len(list(root.iterancestors()))\n        target_attrs = self.__get_attributes(root, ignore_attributes) if ignore_attributes else root.attrib\n\n        path_parts = [self.tag]\n        if (parent := root.getparent()) is not None:\n            path_parts.insert(0, parent.tag)\n            if (grandparent := parent.getparent()) is not None:\n                path_parts.insert(0, grandparent.tag)\n\n        xpath_path = \"//{}\".format(\"/\".join(path_parts))\n        potential_matches = root.xpath(f\"{xpath_path}[count(ancestor::*) = {current_depth}]\")\n\n        for potential_match in potential_matches:\n            if potential_match != root and self.__are_alike(\n                root,\n                target_attrs,\n                potential_match,\n                ignore_attributes,\n                similarity_threshold,\n                match_text,\n            ):\n                similar_elements.append(potential_match)\n\n        return Selectors(map(self.__element_convertor, similar_elements))\n\n    @overload\n    def find_by_text(\n        self,\n        text: str,\n        first_match: Literal[True] = ...,\n        partial: bool = ...,\n        case_sensitive: bool = ...,\n        clean_match: bool = ...,\n    ) -> \"Selector\": ...\n\n    @overload\n    def find_by_text(\n        self,\n        text: str,\n        first_match: Literal[False],\n        partial: bool = ...,\n        case_sensitive: bool = ...,\n        clean_match: bool = ...,\n    ) -> \"Selectors\": ...\n\n    def find_by_text(\n        self,\n        text: str,\n        first_match: bool = True,\n        partial: bool = False,\n        case_sensitive: bool = False,\n        clean_match: bool = True,\n    ) -> Union[\"Selectors\", \"Selector\"]:\n        \"\"\"Find elements that its text content fully/partially matches input.\n        :param text: Text query to match\n        :param first_match: Returns the first element that matches conditions, enabled by default\n        :param partial: If enabled, the function returns elements that contain the input text\n        :param case_sensitive: if enabled, the letters case will be taken into consideration\n        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching\n        \"\"\"\n        if self._is_text_node(self._root):\n            return Selectors()\n\n        results = Selectors()\n        if not 
case_sensitive:\n            text = text.lower()\n\n        possible_targets = cast(List, _find_all_elements_with_spaces(self._root))\n        if possible_targets:\n            for node in self.__elements_convertor(possible_targets):\n                \"\"\"Check if element matches given text otherwise, traverse the children tree and iterate\"\"\"\n                node_text: TextHandler = node.text\n                if clean_match:\n                    node_text = TextHandler(node_text.clean())\n\n                if not case_sensitive:\n                    node_text = TextHandler(node_text.lower())\n\n                if partial:\n                    if text in node_text:\n                        results.append(node)\n                elif text == node_text:\n                    results.append(node)\n\n                if first_match and results:\n                    # we got an element so we should stop\n                    break\n\n            if first_match:\n                if results:\n                    return results[0]\n        return results\n\n    @overload\n    def find_by_regex(\n        self,\n        query: str | Pattern[str],\n        first_match: Literal[True] = ...,\n        case_sensitive: bool = ...,\n        clean_match: bool = ...,\n    ) -> \"Selector\": ...\n\n    @overload\n    def find_by_regex(\n        self,\n        query: str | Pattern[str],\n        first_match: Literal[False],\n        case_sensitive: bool = ...,\n        clean_match: bool = ...,\n    ) -> \"Selectors\": ...\n\n    def find_by_regex(\n        self,\n        query: str | Pattern[str],\n        first_match: bool = True,\n        case_sensitive: bool = False,\n        clean_match: bool = True,\n    ) -> Union[\"Selectors\", \"Selector\"]:\n        \"\"\"Find elements that its text content matches the input regex pattern.\n        :param query: Regex query/pattern to match\n        :param first_match: Return the first element that matches conditions; enabled by default.\n        :param case_sensitive: If enabled, the letters case will be taken into consideration in the regex.\n        :param clean_match: If enabled, this will ignore all whitespaces and consecutive spaces while matching.\n        \"\"\"\n        if self._is_text_node(self._root):\n            return Selectors()\n\n        results = Selectors()\n\n        possible_targets = cast(List, _find_all_elements_with_spaces(self._root))\n        if possible_targets:\n            for node in self.__elements_convertor(possible_targets):\n                \"\"\"Check if element matches given regex otherwise, traverse the children tree and iterate\"\"\"\n                node_text = node.text\n                if node_text.re(\n                    query,\n                    check_match=True,\n                    clean_match=clean_match,\n                    case_sensitive=case_sensitive,\n                ):\n                    results.append(node)\n\n                if first_match and results:\n                    # we got an element so we should stop\n                    break\n\n            if results and first_match:\n                return results[0]\n        return results\n\n\nclass Selectors(List[Selector]):\n    \"\"\"\n    The `Selectors` class is a subclass of the builtin ``List`` class, which provides a few additional methods.\n    \"\"\"\n\n    __slots__ = ()\n\n    @overload\n    def __getitem__(self, pos: SupportsIndex) -> Selector:\n        pass\n\n    @overload\n    def __getitem__(self, pos: slice) -> \"Selectors\":\n        
pass\n\n    def __getitem__(self, pos: SupportsIndex | slice) -> Union[Selector, \"Selectors\"]:\n        lst = super().__getitem__(pos)\n        if isinstance(pos, slice):\n            return self.__class__(cast(List[Selector], lst))\n        else:\n            return cast(Selector, lst)\n\n    def xpath(\n        self,\n        selector: str,\n        identifier: str = \"\",\n        auto_save: bool = False,\n        percentage: int = 0,\n        **kwargs: Any,\n    ) -> \"Selectors\":\n        \"\"\"\n        Call the ``.xpath()`` method for each element in this list and return\n        their results as another `Selectors` class.\n\n        **Important:\n        It's recommended to use the identifier argument if you plan to use a different selector later\n        and want to relocate the same element(s)**\n\n         Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**\n\n        :param selector: The XPath selector to be used.\n        :param identifier: A string that will be used to retrieve element's data in adaptive,\n         otherwise the selector will be used.\n        :param auto_save: Automatically save new elements for `adaptive` later\n        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.\n         Be aware that the percentage calculation depends solely on the page structure, so don't play with this\n         number unless you must know what you are doing!\n\n        :return: `Selectors` class.\n        \"\"\"\n        results = [n.xpath(selector, identifier or selector, False, auto_save, percentage, **kwargs) for n in self]\n        return self.__class__(flatten(results))\n\n    def css(\n        self,\n        selector: str,\n        identifier: str = \"\",\n        auto_save: bool = False,\n        percentage: int = 0,\n    ) -> \"Selectors\":\n        \"\"\"\n        Call the ``.css()`` method for each element in this list and return\n        their results flattened as another `Selectors` class.\n\n        **Important:\n        It's recommended to use the identifier argument if you plan to use a different selector later\n        and want to relocate the same element(s)**\n\n        :param selector: The CSS3 selector to be used.\n        :param identifier: A string that will be used to retrieve element's data in adaptive,\n         otherwise the selector will be used.\n        :param auto_save: Automatically save new elements for `adaptive` later\n        :param percentage: The minimum percentage to accept while `adaptive` is working and not going lower than that.\n         Be aware that the percentage calculation depends solely on the page structure, so don't play with this\n         number unless you must know what you are doing!\n\n        :return: `Selectors` class.\n        \"\"\"\n        results = [n.css(selector, identifier or selector, False, auto_save, percentage) for n in self]\n        return self.__class__(flatten(results))\n\n    def re(\n        self,\n        regex: str | Pattern,\n        replace_entities: bool = True,\n        clean_match: bool = False,\n        case_sensitive: bool = True,\n    ) -> TextHandlers:\n        \"\"\"Call the ``.re()`` method for each element in this list and return\n        their results flattened as List of TextHandler.\n\n        :param regex: Can be either a compiled regular expression or a string.\n        :param replace_entities: If enabled character entity references are replaced by their corresponding 
character\n        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching\n        :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it\n        \"\"\"\n        results = [n.re(regex, replace_entities, clean_match, case_sensitive) for n in self]\n        return TextHandlers(flatten(results))\n\n    def re_first(\n        self,\n        regex: str | Pattern,\n        default: Any = None,\n        replace_entities: bool = True,\n        clean_match: bool = False,\n        case_sensitive: bool = True,\n    ) -> TextHandler:\n        \"\"\"Call the ``.re_first()`` method for each element in this list and return\n        the first result or the default value otherwise.\n\n        :param regex: Can be either a compiled regular expression or a string.\n        :param default: The default value to be returned if there is no match\n        :param replace_entities: if enabled character entity references are replaced by their corresponding character\n        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching\n        :param case_sensitive: if disabled, function will set the regex to ignore the letters case while compiling it\n        \"\"\"\n        for n in self:\n            for result in n.re(regex, replace_entities, clean_match, case_sensitive):\n                return result\n        return default\n\n    def search(self, func: Callable[[\"Selector\"], bool]) -> Optional[\"Selector\"]:\n        \"\"\"Loop over all current elements and return the first element that matches the passed function\n        :param func: A function that takes each element as an argument and returns True/False\n        :return: The first element that match the function or ``None`` otherwise.\n        \"\"\"\n        for element in self:\n            if func(element):\n                return element\n        return None\n\n    def filter(self, func: Callable[[\"Selector\"], bool]) -> \"Selectors\":\n        \"\"\"Filter current elements based on the passed function\n        :param func: A function that takes each element as an argument and returns True/False\n        :return: The new `Selectors` object or empty list otherwise.\n        \"\"\"\n        return self.__class__([element for element in self if func(element)])\n\n    @overload\n    def get(self) -> Optional[TextHandler]: ...\n\n    @overload\n    def get(self, default: _T) -> Union[TextHandler, _T]: ...\n\n    def get(self, default=None):\n        \"\"\"Returns the serialized string of the first element, or ``default`` if empty.\n        :param default: the default value to return if the current list is empty\n        \"\"\"\n        for x in self:\n            return x.get()\n        return default\n\n    def getall(self) -> TextHandlers:\n        \"\"\"Serialize all elements and return as a TextHandlers list.\"\"\"\n        return TextHandlers([x.get() for x in self])\n\n    extract = getall\n    extract_first = get\n\n    @property\n    def first(self) -> Optional[Selector]:\n        \"\"\"Returns the first Selector item of the current list or `None` if the list is empty\"\"\"\n        return self[0] if len(self) > 0 else None\n\n    @property\n    def last(self) -> Optional[Selector]:\n        \"\"\"Returns the last Selector item of the current list or `None` if the list is empty\"\"\"\n        return self[-1] if len(self) > 0 else None\n\n    @property\n    def length(self) -> int:\n        
\"\"\"Returns the length of the current list\"\"\"\n        return len(self)\n\n    def __getstate__(self) -> Any:  # pragma: no cover\n        # lxml don't like it :)\n        raise TypeError(\"Can't pickle Selectors object\")\n\n\n# For backward compatibility\nAdaptor = Selector\nAdaptors = Selectors\n"
  },
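The element-finding surface above (`find_all`, `find`, `find_by_text`, `find_similar`, plus the chainable `Selectors` container) is easiest to see in use. A minimal sketch follows; it assumes `page` is an already-constructed `Selector` over a product-listing page and that the import path is `scrapling.parser`, neither of which is shown in this excerpt, so treat both as assumptions.

```python
import re

# Assumed import path; this excerpt doesn't show where Selector/Selectors live.
from scrapling.parser import Selector, Selectors


def explore(page: Selector) -> None:
    # A tag name plus an attribute dict is compiled internally into a CSS selector.
    products: Selectors = page.find_all("div", {"class": "product"})

    # Tag names, compiled regex patterns, and callables can be mixed freely;
    # callables receive `Selector` objects and act as extra filters.
    discounted = page.find_all("a", re.compile(r"\d+% off"), lambda el: "sale" in el.text.lower())

    # `find` runs the same search and returns the first hit, or None.
    main_area = page.find("div", {"id": "main"})

    # Text lookup: case-insensitive and whitespace-cleaned by default.
    buttons = page.find_by_text("Add to cart", partial=True, first_match=False)

    # Structural similarity search starting from one known element.
    if products.first is not None:
        lookalikes = products.first.find_similar(similarity_threshold=0.3)
        print(f"{len(lookalikes)} similar elements found")

    # `Selectors` is a list subclass, so results filter and chain naturally.
    priced = products.filter(lambda el: el.find("span", {"class": "price"}) is not None)
    print(priced.length, priced.re_first(r"\$\d+(\.\d+)?"))
    print(len(discounted), len(buttons), main_area is not None)
```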
  {
    "path": "scrapling/py.typed",
    "content": "\r\n"
  },
  {
    "path": "scrapling/spiders/__init__.py",
    "content": "from .request import Request\nfrom .result import CrawlResult\nfrom .scheduler import Scheduler\nfrom .engine import CrawlerEngine\nfrom .session import SessionManager\nfrom .spider import Spider, SessionConfigurationError\nfrom scrapling.engines.toolbelt.custom import Response\n\n__all__ = [\n    \"Spider\",\n    \"SessionConfigurationError\",\n    \"Request\",\n    \"CrawlerEngine\",\n    \"CrawlResult\",\n    \"SessionManager\",\n    \"Scheduler\",\n    \"Response\",\n]\n"
  },
  {
    "path": "scrapling/spiders/checkpoint.py",
    "content": "import pickle\nfrom pathlib import Path\nfrom dataclasses import dataclass, field\n\nimport anyio\nfrom anyio import Path as AsyncPath\n\nfrom scrapling.core.utils import log\nfrom scrapling.core._types import Set, List, Optional, TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from scrapling.spiders.request import Request\n\n\n@dataclass\nclass CheckpointData:\n    \"\"\"Container for checkpoint state.\"\"\"\n\n    requests: List[\"Request\"] = field(default_factory=list)\n    seen: Set[bytes] = field(default_factory=set)\n\n\nclass CheckpointManager:\n    \"\"\"Manages saving and loading checkpoint state to/from disk.\"\"\"\n\n    CHECKPOINT_FILE = \"checkpoint.pkl\"\n\n    def __init__(self, crawldir: str | Path | AsyncPath, interval: float = 300.0):\n        self.crawldir = AsyncPath(crawldir)\n        self._checkpoint_path = self.crawldir / self.CHECKPOINT_FILE\n        self.interval = interval\n        if not isinstance(interval, (int, float)):\n            raise TypeError(\"Checkpoints interval must be integer or float.\")\n        else:\n            if interval < 0:\n                raise ValueError(\"Checkpoints interval must be equal or greater than 0.\")\n\n    async def has_checkpoint(self) -> bool:\n        \"\"\"Check if a checkpoint exists.\"\"\"\n        return await self._checkpoint_path.exists()\n\n    async def save(self, data: CheckpointData) -> None:\n        \"\"\"Save checkpoint data to disk atomically.\"\"\"\n        await self.crawldir.mkdir(parents=True, exist_ok=True)\n\n        temp_path = self._checkpoint_path.with_suffix(\".tmp\")\n\n        try:\n            serialized = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL)\n            async with await anyio.open_file(temp_path, \"wb\") as f:\n                await f.write(serialized)\n\n            await temp_path.rename(self._checkpoint_path)\n\n            log.info(f\"Checkpoint saved: {len(data.requests)} requests, {len(data.seen)} seen URLs\")\n        except Exception as e:\n            # Clean up temp file if it exists\n            if await temp_path.exists():\n                await temp_path.unlink()\n            log.error(f\"Failed to save checkpoint: {e}\")\n            raise\n\n    async def load(self) -> Optional[CheckpointData]:\n        \"\"\"Load checkpoint data from disk.\n\n        Returns None if no checkpoint exists or if loading fails.\n        \"\"\"\n        if not await self.has_checkpoint():\n            return None\n\n        try:\n            async with await anyio.open_file(self._checkpoint_path, \"rb\") as f:\n                content = await f.read()\n                data: CheckpointData = pickle.loads(content)\n\n            log.info(f\"Checkpoint loaded: {len(data.requests)} requests, {len(data.seen)} seen URLs\")\n            return data\n\n        except Exception as e:\n            log.error(f\"Failed to load checkpoint (starting fresh): {e}\")\n            return None\n\n    async def cleanup(self) -> None:\n        \"\"\"Delete checkpoint file after successful completion.\"\"\"\n        try:\n            if await self._checkpoint_path.exists():\n                await self._checkpoint_path.unlink()\n            log.debug(\"Checkpoint file cleaned up\")\n        except Exception as e:\n            log.warning(f\"Failed to cleanup checkpoint file: {e}\")\n"
  },
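`CheckpointManager` is normally driven by the crawler engine, but the save/load round-trip is simple enough to exercise directly. A sketch under assumed paths (the crawl directory below is illustrative):

```python
import anyio

from scrapling.spiders.request import Request
from scrapling.spiders.checkpoint import CheckpointManager, CheckpointData


async def main() -> None:
    manager = CheckpointManager("./crawls/demo", interval=60.0)  # directory is illustrative

    pending = [Request("https://example.com/page/2", priority=5)]
    seen = {pending[0].update_fingerprint()}

    # save() writes to a .tmp file first, then renames it over checkpoint.pkl
    await manager.save(CheckpointData(requests=pending, seen=seen))

    restored = await manager.load()  # None if the file is missing or unreadable
    if restored is not None:
        print(len(restored.requests), "queued request(s) restored")

    await manager.cleanup()  # remove checkpoint.pkl once the crawl completes


anyio.run(main)
```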
  {
    "path": "scrapling/spiders/engine.py",
    "content": "import json\nimport pprint\nfrom pathlib import Path\n\nimport anyio\nfrom anyio import Path as AsyncPath\nfrom anyio import create_task_group, CapacityLimiter, create_memory_object_stream, EndOfStream\n\nfrom scrapling.core.utils import log\nfrom scrapling.spiders.request import Request\nfrom scrapling.spiders.scheduler import Scheduler\nfrom scrapling.spiders.session import SessionManager\nfrom scrapling.spiders.result import CrawlStats, ItemList\nfrom scrapling.spiders.checkpoint import CheckpointManager, CheckpointData\nfrom scrapling.core._types import Dict, Union, Optional, TYPE_CHECKING, Any, AsyncGenerator\n\nif TYPE_CHECKING:\n    from scrapling.spiders.spider import Spider\n\n\ndef _dump(obj: Dict) -> str:\n    return json.dumps(obj, indent=4)\n\n\nclass CrawlerEngine:\n    \"\"\"Orchestrates the crawling process.\"\"\"\n\n    def __init__(\n        self,\n        spider: \"Spider\",\n        session_manager: SessionManager,\n        crawldir: Optional[Union[str, Path, AsyncPath]] = None,\n        interval: float = 300.0,\n    ):\n        self.spider = spider\n        self.session_manager = session_manager\n        self.scheduler = Scheduler(\n            include_kwargs=spider.fp_include_kwargs,\n            include_headers=spider.fp_include_headers,\n            keep_fragments=spider.fp_keep_fragments,\n        )\n        self.stats = CrawlStats()\n\n        self._global_limiter = CapacityLimiter(spider.concurrent_requests)\n        self._domain_limiters: dict[str, CapacityLimiter] = {}\n        self._allowed_domains: set[str] = spider.allowed_domains or set()\n\n        self._active_tasks: int = 0\n        self._running: bool = False\n        self._items: ItemList = ItemList()\n        self._item_stream: Any = None\n\n        self._checkpoint_system_enabled = bool(crawldir)\n        self._checkpoint_manager = CheckpointManager(crawldir or \"\", interval)\n        self._last_checkpoint_time: float = 0.0\n        self._pause_requested: bool = False\n        self._force_stop: bool = False\n        self.paused: bool = False\n\n    def _is_domain_allowed(self, request: Request) -> bool:\n        \"\"\"Check if the request's domain is in allowed_domains.\"\"\"\n        if not self._allowed_domains:\n            return True\n\n        domain = request.domain\n        for allowed in self._allowed_domains:\n            if domain == allowed or domain.endswith(\".\" + allowed):\n                return True\n        return False\n\n    def _rate_limiter(self, domain: str) -> CapacityLimiter:\n        \"\"\"Get or create a per-domain concurrency limiter if enabled, otherwise use the global limiter.\"\"\"\n        if self.spider.concurrent_requests_per_domain:\n            if domain not in self._domain_limiters:\n                self._domain_limiters[domain] = CapacityLimiter(self.spider.concurrent_requests_per_domain)\n            return self._domain_limiters[domain]\n        return self._global_limiter\n\n    def _normalize_request(self, request: Request) -> None:\n        \"\"\"Normalize request fields before enqueueing.\n\n        Resolves empty sid to the session manager's default session ID.\n        This ensures consistent fingerprinting for requests using the same session.\n        \"\"\"\n        if not request.sid:\n            request.sid = self.session_manager.default_session_id\n\n    async def _process_request(self, request: Request) -> None:\n        \"\"\"Download and process a single request.\"\"\"\n        async with self._rate_limiter(request.domain):\n     
       if self.spider.download_delay:\n                await anyio.sleep(self.spider.download_delay)\n\n            if request._session_kwargs.get(\"proxy\"):\n                self.stats.proxies.append(request._session_kwargs[\"proxy\"])\n            if request._session_kwargs.get(\"proxies\"):\n                self.stats.proxies.append(dict(request._session_kwargs[\"proxies\"]))\n            try:\n                response = await self.session_manager.fetch(request)\n                self.stats.increment_requests_count(request.sid or self.session_manager.default_session_id)\n                self.stats.increment_response_bytes(request.domain, len(response.body))\n                self.stats.increment_status(response.status)\n\n            except Exception as e:\n                self.stats.failed_requests_count += 1\n                await self.spider.on_error(request, e)\n                return\n\n        if await self.spider.is_blocked(response):\n            self.stats.blocked_requests_count += 1\n            if request._retry_count < self.spider.max_blocked_retries:\n                retry_request = request.copy()\n                retry_request._retry_count += 1\n                retry_request.priority -= 1  # Don't retry immediately\n                retry_request.dont_filter = True\n                retry_request._session_kwargs.pop(\"proxy\", None)\n                retry_request._session_kwargs.pop(\"proxies\", None)\n\n                new_request = await self.spider.retry_blocked_request(retry_request, response)\n                self._normalize_request(new_request)\n                await self.scheduler.enqueue(new_request)\n                log.info(\n                    f\"Scheduled blocked request for retry ({retry_request._retry_count}/{self.spider.max_blocked_retries}): {request.url}\"\n                )\n            else:\n                log.warning(f\"Max retries exceeded for blocked request: {request.url}\")\n            return\n\n        callback = request.callback if request.callback else self.spider.parse\n        try:\n            async for result in callback(response):\n                if isinstance(result, Request):\n                    if self._is_domain_allowed(result):\n                        self._normalize_request(result)\n                        await self.scheduler.enqueue(result)\n                    else:\n                        self.stats.offsite_requests_count += 1\n                        log.debug(f\"Filtered offsite request to: {result.url}\")\n                elif isinstance(result, dict):\n                    processed_result = await self.spider.on_scraped_item(result)\n                    if processed_result:\n                        self.stats.items_scraped += 1\n                        log.debug(f\"Scraped from {str(response)}\\n{pprint.pformat(processed_result)}\")\n                        if self._item_stream:\n                            await self._item_stream.send(processed_result)\n                        else:\n                            self._items.append(processed_result)\n                    else:\n                        self.stats.items_dropped += 1\n                        log.warning(f\"Dropped from {str(response)}\\n{processed_result}\")\n                elif result is not None:\n                    log.error(f\"Spider must return Request, dict or None, got '{type(result)}' in {request}\")\n        except Exception as e:\n            msg = f\"Spider error processing {request}:\\n {e}\"\n            log.error(msg, exc_info=e)\n            
await self.spider.on_error(request, e)\n\n    async def _task_wrapper(self, request: Request) -> None:\n        \"\"\"Wrapper to track active task count.\"\"\"\n        try:\n            await self._process_request(request)\n        finally:\n            self._active_tasks -= 1\n\n    def request_pause(self) -> None:\n        \"\"\"Request a graceful pause of the crawl.\n\n        First call: requests graceful pause (waits for active tasks).\n        Second call: forces immediate stop.\n        \"\"\"\n        if self._force_stop:\n            return  # Already forcing stop\n\n        if self._pause_requested:\n            # Second Ctrl+C - force stop\n            self._force_stop = True\n            log.warning(\"Force stop requested, cancelling immediately...\")\n        else:\n            self._pause_requested = True\n            log.info(\n                \"Pause requested, waiting for in-flight requests to complete (press Ctrl+C again to force stop)...\"\n            )\n\n    async def _save_checkpoint(self) -> None:\n        \"\"\"Save current state to checkpoint files.\"\"\"\n        requests, seen = self.scheduler.snapshot()\n        data = CheckpointData(requests=requests, seen=seen)\n        await self._checkpoint_manager.save(data)\n        self._last_checkpoint_time = anyio.current_time()\n\n    def _is_checkpoint_time(self) -> bool:\n        \"\"\"Check if it's time for the periodic checkpoint.\"\"\"\n        if not self._checkpoint_system_enabled:\n            return False\n\n        if self._checkpoint_manager.interval == 0:\n            return False\n\n        current_time = anyio.current_time()\n        return (current_time - self._last_checkpoint_time) >= self._checkpoint_manager.interval\n\n    async def _restore_from_checkpoint(self) -> bool:\n        \"\"\"Attempt to restore state from checkpoint.\n\n        Returns True if successfully restored, False otherwise.\n        \"\"\"\n        if not self._checkpoint_system_enabled:\n            raise\n\n        data = await self._checkpoint_manager.load()\n        if data is None:\n            return False\n\n        self.scheduler.restore(data)\n\n        # Restore callbacks from spider after scheduler restore\n        for request in data.requests:\n            request._restore_callback(self.spider)\n\n        return True\n\n    async def crawl(self) -> CrawlStats:\n        \"\"\"Run the spider and return CrawlStats.\"\"\"\n        self._running = True\n        self._items.clear()\n        self.paused = False\n        self._pause_requested = False\n        self._force_stop = False\n        self.stats = CrawlStats(start_time=anyio.current_time())\n\n        # Check for existing checkpoint\n        resuming = (await self._restore_from_checkpoint()) if self._checkpoint_system_enabled else False\n        self._last_checkpoint_time = anyio.current_time()\n\n        async with self.session_manager:\n            self.stats.concurrent_requests = self.spider.concurrent_requests\n            self.stats.concurrent_requests_per_domain = self.spider.concurrent_requests_per_domain\n            self.stats.download_delay = self.spider.download_delay\n            await self.spider.on_start(resuming=resuming)\n\n            try:\n                if not resuming:\n                    async for request in self.spider.start_requests():\n                        self._normalize_request(request)\n                        await self.scheduler.enqueue(request)\n                else:\n                    log.info(\"Resuming from checkpoint, skipping 
start_requests()\")\n\n                # Process queue\n                async with create_task_group() as tg:\n                    while self._running:\n                        if self._pause_requested:\n                            if self._active_tasks == 0 or self._force_stop:\n                                if self._force_stop:\n                                    log.warning(f\"Force stopping with {self._active_tasks} active tasks\")\n                                    tg.cancel_scope.cancel()\n\n                                # Only save checkpoint if checkpoint system is enabled\n                                if self._checkpoint_system_enabled:\n                                    await self._save_checkpoint()\n                                    self.paused = True\n                                    log.info(\"Spider paused, checkpoint saved\")\n                                else:\n                                    log.info(\"Spider stopped gracefully\")\n\n                                self._running = False\n                                break\n\n                            # Wait briefly and check again\n                            await anyio.sleep(0.05)\n                            continue\n\n                        if self._checkpoint_system_enabled and self._is_checkpoint_time():\n                            await self._save_checkpoint()\n\n                        if self.scheduler.is_empty:\n                            # Empty queue + no active tasks = done\n                            if self._active_tasks == 0:\n                                self._running = False\n                                log.debug(\"Spider idle\")\n                                break\n\n                            # Brief wait for callbacks to enqueue new requests\n                            await anyio.sleep(0.05)\n                            continue\n\n                        # Only spawn tasks up to concurrent_requests limit\n                        # This prevents spawning thousands of waiting tasks\n                        if self._active_tasks >= self.spider.concurrent_requests:\n                            await anyio.sleep(0.01)\n                            continue\n\n                        request = await self.scheduler.dequeue()\n                        self._active_tasks += 1\n                        tg.start_soon(self._task_wrapper, request)\n\n            finally:\n                await self.spider.on_close()\n                # Clean up checkpoint files on successful completion (not paused)\n                if not self.paused and self._checkpoint_system_enabled:\n                    await self._checkpoint_manager.cleanup()\n\n        self.stats.log_levels_counter = self.spider._log_counter.get_counts()\n        self.stats.end_time = anyio.current_time()\n        log.info(_dump(self.stats.to_dict()))\n        return self.stats\n\n    @property\n    def items(self) -> ItemList:\n        \"\"\"Access scraped items.\"\"\"\n        return self._items\n\n    def __aiter__(self) -> AsyncGenerator[dict, None]:\n        return self._stream()\n\n    async def _stream(self) -> AsyncGenerator[dict, None]:\n        \"\"\"Async generator that runs crawl and yields items.\"\"\"\n        send, recv = create_memory_object_stream[dict](100)\n        self._item_stream = send\n\n        async def run():\n            try:\n                await self.crawl()\n            finally:\n                await send.aclose()\n\n        async with create_task_group() as tg:\n            
tg.start_soon(run)\n            try:\n                async for item in recv:\n                    yield item\n            except EndOfStream:\n                pass\n"
  },
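One detail of the engine worth calling out is the offsite filter: an entry in `allowed_domains` matches both the exact host and any of its subdomains, thanks to the `endswith("." + allowed)` check. A standalone sketch of that rule (not the engine itself), useful for sanity-checking which URLs would be kept:

```python
from urllib.parse import urlparse


def is_domain_allowed(url: str, allowed_domains: set[str]) -> bool:
    """Mirror of the engine's offsite rule: keep the request when the host equals
    an allowed domain or is a subdomain of it; no restriction when the set is empty."""
    if not allowed_domains:
        return True
    host = urlparse(url).netloc
    return any(host == allowed or host.endswith("." + allowed) for allowed in allowed_domains)


print(is_domain_allowed("https://shop.example.com/p/1", {"example.com"}))  # True: subdomain
print(is_domain_allowed("https://example.com.evil.io/", {"example.com"}))  # False: only a prefix
```

The leading dot in the `endswith` check is what prevents look-alike hosts such as `example.com.evil.io` from slipping through.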
  {
    "path": "scrapling/spiders/request.py",
    "content": "import hashlib\nfrom io import BytesIO\nfrom functools import cached_property\nfrom urllib.parse import urlparse, urlencode\n\nimport orjson\nfrom w3lib.url import canonicalize_url\n\nfrom scrapling.engines.toolbelt.custom import Response\nfrom scrapling.core._types import Any, AsyncGenerator, Callable, Dict, Optional, Union, Tuple, TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from scrapling.spiders.spider import Spider\n\n\ndef _convert_to_bytes(value: str | bytes) -> bytes:\n    if isinstance(value, bytes):\n        return value\n    if not isinstance(value, str):\n        raise TypeError(f\"Can't convert {type(value).__name__} to bytes\")\n\n    return value.encode(encoding=\"utf-8\", errors=\"ignore\")\n\n\nclass Request:\n    def __init__(\n        self,\n        url: str,\n        sid: str = \"\",\n        callback: Callable[[Response], AsyncGenerator[Union[Dict[str, Any], \"Request\", None], None]] | None = None,\n        priority: int = 0,\n        dont_filter: bool = False,\n        meta: dict[str, Any] | None = None,\n        _retry_count: int = 0,\n        **kwargs: Any,\n    ) -> None:\n        self.url: str = url\n        self.sid: str = sid\n        self.callback = callback\n        self.priority: int = priority\n        self.dont_filter: bool = dont_filter\n        self.meta: dict[str, Any] = meta if meta else {}\n        self._retry_count: int = _retry_count\n        self._session_kwargs = kwargs if kwargs else {}\n        self._fp: Optional[bytes] = None\n\n    def copy(self) -> \"Request\":\n        \"\"\"Create a copy of this request.\"\"\"\n        return Request(\n            url=self.url,\n            sid=self.sid,\n            callback=self.callback,\n            priority=self.priority,\n            dont_filter=self.dont_filter,\n            meta=self.meta.copy(),\n            _retry_count=self._retry_count,\n            **self._session_kwargs,\n        )\n\n    @cached_property\n    def domain(self) -> str:\n        return urlparse(self.url).netloc\n\n    def update_fingerprint(\n        self,\n        include_kwargs: bool = False,\n        include_headers: bool = False,\n        keep_fragments: bool = False,\n    ) -> bytes:\n        \"\"\"Generate a unique fingerprint for deduplication.\n\n        Caches the result in self._fp after first computation.\n        \"\"\"\n        if self._fp is not None:\n            return self._fp\n\n        post_data = self._session_kwargs.get(\"data\", {})\n        body = b\"\"\n        if post_data:\n            if isinstance(post_data, dict | list | tuple):\n                body = urlencode(post_data).encode()\n            elif isinstance(post_data, str):\n                body = post_data.encode()\n            elif isinstance(post_data, BytesIO):\n                body = post_data.getvalue()\n            elif isinstance(post_data, bytes):\n                body = post_data\n        else:\n            post_data = self._session_kwargs.get(\"json\", {})\n            body = orjson.dumps(post_data) if post_data else b\"\"\n\n        data: Dict[str, str | Tuple] = {\n            \"sid\": self.sid,\n            \"body\": body.hex(),\n            \"method\": self._session_kwargs.get(\"method\", \"GET\"),\n            \"url\": canonicalize_url(self.url, keep_fragments=keep_fragments),\n        }\n\n        if include_kwargs:\n            kwargs = (key.lower() for key in self._session_kwargs.keys() if key.lower() not in (\"data\", \"json\"))\n            data[\"kwargs\"] = \"\".join(set(_convert_to_bytes(key).hex() for key in 
kwargs))\n\n        if include_headers:\n            headers = self._session_kwargs.get(\"headers\") or self._session_kwargs.get(\"extra_headers\") or {}\n            processed_headers = {}\n            # Some header normalization\n            for key, value in headers.items():\n                processed_headers[_convert_to_bytes(key.lower()).hex()] = _convert_to_bytes(value.lower()).hex()\n            data[\"headers\"] = tuple(processed_headers.items())\n\n        fp = hashlib.sha1(orjson.dumps(data, option=orjson.OPT_SORT_KEYS), usedforsecurity=False).digest()\n        self._fp = fp\n        return fp\n\n    def __repr__(self) -> str:\n        callback_name = getattr(self.callback, \"__name__\", None) or \"None\"\n        return f\"<Request({self.url}) priority={self.priority} callback={callback_name}>\"\n\n    def __str__(self) -> str:\n        return self.url\n\n    def __lt__(self, other: object) -> bool:\n        \"\"\"Compare requests by priority\"\"\"\n        if not isinstance(other, Request):\n            return NotImplemented\n        return self.priority < other.priority\n\n    def __gt__(self, other: object) -> bool:\n        \"\"\"Compare requests by priority\"\"\"\n        if not isinstance(other, Request):\n            return NotImplemented\n        return self.priority > other.priority\n\n    def __eq__(self, other: object) -> bool:\n        \"\"\"Requests are equal if they have the same fingerprint.\"\"\"\n        if not isinstance(other, Request):\n            return NotImplemented\n        if self._fp is None or other._fp is None:\n            raise RuntimeError(\"Cannot compare requests before generating their fingerprints!\")\n        return self._fp == other._fp\n\n    def __getstate__(self) -> dict[str, Any]:\n        \"\"\"Prepare state for pickling - store callback as name string for pickle compatibility.\"\"\"\n        state = self.__dict__.copy()\n        state[\"_callback_name\"] = getattr(self.callback, \"__name__\", None) if self.callback is not None else None\n        state[\"callback\"] = None  # Don't pickle the actual callable\n        return state\n\n    def __setstate__(self, state: dict[str, Any]) -> None:\n        \"\"\"Restore state from pickle - callback restored later via _restore_callback().\"\"\"\n        self._callback_name: str | None = state.pop(\"_callback_name\", None)\n        self.__dict__.update(state)\n\n    def _restore_callback(self, spider: \"Spider\") -> None:\n        \"\"\"Restore callback from spider after unpickling.\n\n        :param spider: Spider instance to look up callback method on\n        \"\"\"\n        if hasattr(self, \"_callback_name\") and self._callback_name:\n            self.callback = getattr(spider, self._callback_name, None) or spider.parse\n            del self._callback_name\n        elif hasattr(self, \"_callback_name\"):\n            del self._callback_name\n"
  },
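Deduplication hinges on `Request.update_fingerprint()`: a SHA-1 over the session id, method, body, and the canonicalized URL (query-parameter order is normalized and fragments are dropped unless `keep_fragments=True`). A small sketch of the consequence:

```python
from scrapling.spiders.request import Request

a = Request("https://example.com/items?page=1&sort=asc")
b = Request("https://example.com/items?sort=asc&page=1#reviews")

# Reordered query strings and fragments don't create "new" URLs by default,
# so these two requests share a fingerprint and the Scheduler would drop `b`.
fp_a = a.update_fingerprint()
fp_b = b.update_fingerprint()
print(fp_a == fp_b)  # True

# The HTTP method and POST body are part of the fingerprint as well.
c = Request("https://example.com/search", method="POST", data={"q": "laptops"})
print(c.update_fingerprint() == fp_a)  # False
```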
  {
    "path": "scrapling/spiders/result.py",
    "content": "from pathlib import Path\nfrom dataclasses import dataclass, field\n\nimport orjson\n\nfrom scrapling.core.utils import log\nfrom scrapling.core._types import Any, Iterator, Dict, List, Tuple, Union\n\n\nclass ItemList(list):\n    \"\"\"A list of scraped items with export capabilities.\"\"\"\n\n    def to_json(self, path: Union[str, Path], *, indent: bool = False):\n        \"\"\"Export items to a JSON file.\n\n        :param path: Path to the output file\n        :param indent: Pretty-print with 2-space indentation (slightly slower)\n        \"\"\"\n        options = orjson.OPT_SERIALIZE_NUMPY\n        if indent:\n            options |= orjson.OPT_INDENT_2\n\n        file = Path(path)\n        file.parent.mkdir(parents=True, exist_ok=True)\n        file.write_bytes(orjson.dumps(list(self), option=options))\n        log.info(\"Saved %d items to %s\", len(self), path)\n\n    def to_jsonl(self, path: Union[str, Path]):\n        \"\"\"Export items as JSON Lines (one JSON object per line).\n\n        :param path: Path to the output file\n        \"\"\"\n        Path(path).parent.mkdir(parents=True, exist_ok=True)\n        with open(path, \"wb\") as f:\n            for item in self:\n                f.write(orjson.dumps(item, option=orjson.OPT_SERIALIZE_NUMPY))\n                f.write(b\"\\n\")\n        log.info(\"Saved %d items to %s\", len(self), path)\n\n\n@dataclass\nclass CrawlStats:\n    \"\"\"Statistics for a crawl run.\"\"\"\n\n    requests_count: int = 0\n    concurrent_requests: int = 0\n    concurrent_requests_per_domain: int = 0\n    failed_requests_count: int = 0\n    offsite_requests_count: int = 0\n    response_bytes: int = 0\n    items_scraped: int = 0\n    items_dropped: int = 0\n    start_time: float = 0.0\n    end_time: float = 0.0\n    download_delay: float = 0.0\n    blocked_requests_count: int = 0\n    custom_stats: Dict = field(default_factory=dict)\n    response_status_count: Dict = field(default_factory=dict)\n    domains_response_bytes: Dict = field(default_factory=dict)\n    sessions_requests_count: Dict = field(default_factory=dict)\n    proxies: List[str | Dict | Tuple] = field(default_factory=list)\n    log_levels_counter: Dict = field(default_factory=dict)\n\n    @property\n    def elapsed_seconds(self) -> float:\n        return self.end_time - self.start_time\n\n    @property\n    def requests_per_second(self) -> float:\n        if self.elapsed_seconds == 0:\n            return 0.0\n        return self.requests_count / self.elapsed_seconds\n\n    def increment_status(self, status: int) -> None:\n        self.response_status_count[f\"status_{status}\"] = self.response_status_count.get(f\"status_{status}\", 0) + 1\n\n    def increment_response_bytes(self, domain: str, count: int) -> None:\n        self.response_bytes += count\n        self.domains_response_bytes[domain] = self.domains_response_bytes.get(domain, 0) + count\n\n    def increment_requests_count(self, sid: str) -> None:\n        self.requests_count += 1\n        self.sessions_requests_count[sid] = self.sessions_requests_count.get(sid, 0) + 1\n\n    def to_dict(self) -> dict[str, Any]:\n        return {\n            \"items_scraped\": self.items_scraped,\n            \"items_dropped\": self.items_dropped,\n            \"elapsed_seconds\": round(self.elapsed_seconds, 2),\n            \"download_delay\": round(self.download_delay, 2),\n            \"concurrent_requests\": self.concurrent_requests,\n            \"concurrent_requests_per_domain\": self.concurrent_requests_per_domain,\n       
     \"requests_count\": self.requests_count,\n            \"requests_per_second\": round(self.requests_per_second, 2),\n            \"sessions_requests_count\": self.sessions_requests_count,\n            \"failed_requests_count\": self.failed_requests_count,\n            \"offsite_requests_count\": self.offsite_requests_count,\n            \"blocked_requests_count\": self.blocked_requests_count,\n            \"response_status_count\": self.response_status_count,\n            \"response_bytes\": self.response_bytes,\n            \"domains_response_bytes\": self.domains_response_bytes,\n            \"proxies\": self.proxies,\n            \"custom_stats\": self.custom_stats,\n            \"log_count\": self.log_levels_counter,\n        }\n\n\n@dataclass\nclass CrawlResult:\n    \"\"\"Complete result from a spider run.\"\"\"\n\n    stats: CrawlStats\n    items: ItemList\n    paused: bool = False\n\n    @property\n    def completed(self) -> bool:\n        \"\"\"True if the crawl completed normally (not paused).\"\"\"\n        return not self.paused\n\n    def __len__(self) -> int:\n        return len(self.items)\n\n    def __iter__(self) -> Iterator[dict[str, Any]]:\n        return iter(self.items)\n"
  },
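`ItemList` and `CrawlStats` can also be used on their own, which makes the export formats and the derived stats easy to check. A sketch with made-up items and illustrative output paths:

```python
from scrapling.spiders.result import ItemList, CrawlStats

items = ItemList([{"title": "Widget", "price": 9.99}, {"title": "Gadget", "price": 19.5}])
items.to_json("output/items.json", indent=True)  # pretty-printed JSON array
items.to_jsonl("output/items.jsonl")             # one JSON object per line

stats = CrawlStats(start_time=0.0, end_time=12.5)
stats.increment_requests_count("default")
stats.increment_status(200)
stats.increment_response_bytes("example.com", 2048)
print(stats.to_dict()["requests_per_second"])    # 0.08 -> 1 request over 12.5 seconds
```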
  {
    "path": "scrapling/spiders/scheduler.py",
    "content": "import asyncio\nfrom itertools import count\n\nfrom scrapling.core.utils import log\nfrom scrapling.spiders.request import Request\nfrom scrapling.core._types import List, Set, Tuple, TYPE_CHECKING\n\nif TYPE_CHECKING:\n    from scrapling.spiders.checkpoint import CheckpointData\n\n\nclass Scheduler:\n    \"\"\"\n    Priority queue with URL deduplication. (heapq)\n\n    Higher priority requests are processed first.\n    Duplicate URLs are filtered unless dont_filter=True.\n    \"\"\"\n\n    def __init__(self, include_kwargs: bool = False, include_headers: bool = False, keep_fragments: bool = False):\n        self._queue: asyncio.PriorityQueue[tuple[int, int, Request]] = asyncio.PriorityQueue()\n        self._seen: set[bytes] = set()\n        self._counter = count()\n        # Mirror dict for snapshot without draining queue\n        self._pending: dict[int, tuple[int, int, Request]] = {}\n        self._include_kwargs = include_kwargs\n        self._include_headers = include_headers\n        self._keep_fragments = keep_fragments\n\n    async def enqueue(self, request: Request) -> bool:\n        \"\"\"Add a request to the queue.\"\"\"\n        fingerprint = request.update_fingerprint(self._include_kwargs, self._include_headers, self._keep_fragments)\n\n        if not request.dont_filter and fingerprint in self._seen:\n            log.debug(\"Dropped duplicate request: %s\", request)\n            return False\n\n        self._seen.add(fingerprint)\n\n        # Negative priority so higher priority = dequeued first\n        counter = next(self._counter)\n        item = (-request.priority, counter, request)\n        self._pending[counter] = item\n        await self._queue.put(item)\n        return True\n\n    async def dequeue(self) -> Request:\n        \"\"\"Get the next request to process.\"\"\"\n        _, counter, request = await self._queue.get()\n        self._pending.pop(counter, None)\n        return request\n\n    def __len__(self) -> int:\n        return self._queue.qsize()\n\n    @property\n    def is_empty(self) -> bool:\n        return self._queue.empty()\n\n    def snapshot(self) -> Tuple[List[Request], Set[bytes]]:\n        \"\"\"Create a snapshot of the current state for checkpoints.\"\"\"\n        sorted_items = sorted(self._pending.values(), key=lambda x: (x[0], x[1]))  # Maintain queue order\n        requests = [item[2] for item in sorted_items]\n        return requests, self._seen.copy()\n\n    def restore(self, data: \"CheckpointData\") -> None:\n        \"\"\"Restore scheduler state from checkpoint data.\n\n        :param data: CheckpointData containing requests and seen set\n        \"\"\"\n        self._seen = data.seen.copy()\n\n        # Restore pending requests in order (they're already sorted by priority)\n        for request in data.requests:\n            counter = next(self._counter)\n            item = (-request.priority, counter, request)\n            self._pending[counter] = item\n            self._queue.put_nowait(item)\n\n        log.info(f\"Scheduler restored: {len(data.requests)} requests, {len(data.seen)} seen\")\n"
  },
  {
    "path": "scrapling/spiders/session.py",
    "content": "from asyncio import Lock\n\nfrom scrapling.spiders.request import Request\nfrom scrapling.engines.static import _ASyncSessionLogic\nfrom scrapling.engines.toolbelt.convertor import Response\nfrom scrapling.core._types import Set, cast, SUPPORTED_HTTP_METHODS\nfrom scrapling.fetchers import AsyncDynamicSession, AsyncStealthySession, FetcherSession\n\nSession = FetcherSession | AsyncDynamicSession | AsyncStealthySession\n\n\nclass SessionManager:\n    \"\"\"Manages pre-configured session instances.\"\"\"\n\n    def __init__(self) -> None:\n        self._sessions: dict[str, Session] = {}\n        self._default_session_id: str | None = None\n        self._started: bool = False\n        self._lazy_sessions: Set[str] = set()\n        self._lazy_lock = Lock()\n\n    def add(self, session_id: str, session: Session, *, default: bool = False, lazy: bool = False) -> \"SessionManager\":\n        \"\"\"Register a session instance.\n\n        :param session_id: Name to reference this session in requests\n        :param session: Your pre-configured session instance\n        :param default: If True, this becomes the default session\n        :param lazy: If True, the session will be started only when a request uses its ID.\n        \"\"\"\n        if session_id in self._sessions:\n            raise ValueError(f\"Session '{session_id}' already registered\")\n\n        self._sessions[session_id] = session\n\n        if default or self._default_session_id is None:\n            self._default_session_id = session_id\n\n        if lazy:\n            self._lazy_sessions.add(session_id)\n\n        return self\n\n    def remove(self, session_id: str) -> None:\n        \"\"\"Removes a session.\n\n        :param session_id: ID of session to remove\n        \"\"\"\n        _ = self.pop(session_id)\n\n    def pop(self, session_id: str) -> Session:\n        \"\"\"Remove and returns a session.\n\n        :param session_id: ID of session to remove\n        \"\"\"\n        if session_id not in self._sessions:\n            raise KeyError(f\"Session '{session_id}' not found\")\n\n        session = self._sessions.pop(session_id)\n        if session_id in self._lazy_sessions:\n            self._lazy_sessions.remove(session_id)\n\n        if session and self._default_session_id == session_id:\n            self._default_session_id = next(iter(self._sessions), None)\n\n        return session\n\n    @property\n    def default_session_id(self) -> str:\n        if self._default_session_id is None:\n            raise RuntimeError(\"No sessions registered\")\n        return self._default_session_id\n\n    @property\n    def session_ids(self) -> list[str]:\n        return list(self._sessions.keys())\n\n    def get(self, session_id: str) -> Session:\n        if session_id not in self._sessions:\n            available = \", \".join(self._sessions.keys())\n            raise KeyError(f\"Session '{session_id}' not found. 
Available: {available}\")\n        return self._sessions[session_id]\n\n    async def start(self) -> None:\n        \"\"\"Start all sessions that aren't already alive.\"\"\"\n        if self._started:\n            return\n\n        for sid, session in self._sessions.items():\n            if sid not in self._lazy_sessions and not session._is_alive:\n                await session.__aenter__()\n\n        self._started = True\n\n    async def close(self) -> None:\n        \"\"\"Close all registered sessions.\"\"\"\n        for session in self._sessions.values():\n            _ = await session.__aexit__(None, None, None)\n\n        self._started = False\n\n    async def fetch(self, request: Request) -> Response:\n        sid = request.sid if request.sid else self.default_session_id\n        session = self.get(sid)\n\n        if session:\n            if sid in self._lazy_sessions and not session._is_alive:\n                async with self._lazy_lock:\n                    if not session._is_alive:\n                        await session.__aenter__()\n\n            if isinstance(session, FetcherSession):\n                client = session._client\n\n                if isinstance(client, _ASyncSessionLogic):\n                    response = await client._make_request(\n                        method=cast(SUPPORTED_HTTP_METHODS, request._session_kwargs.pop(\"method\", \"GET\")),\n                        url=request.url,\n                        **request._session_kwargs,\n                    )\n                else:\n                    # Sync session or other types - shouldn't happen in async context\n                    raise TypeError(f\"Session type {type(client)} not supported for async fetch\")\n            else:\n                response = await session.fetch(url=request.url, **request._session_kwargs)\n\n            response.request = request\n            # Merge request meta into response meta (response meta takes priority)\n            response.meta = {**request.meta, **response.meta}\n            return response\n        raise RuntimeError(\"No session found with the request session id\")\n\n    async def __aenter__(self) -> \"SessionManager\":\n        await self.start()\n        return self\n\n    async def __aexit__(self, *exc) -> None:\n        await self.close()\n\n    def __contains__(self, session_id: str) -> bool:\n        \"\"\"Check if a session ID is registered.\"\"\"\n        return session_id in self._sessions\n\n    def __len__(self) -> int:\n        \"\"\"Number of registered sessions.\"\"\"\n        return len(self._sessions)\n"
  },
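  {
    "path": "examples/configure_sessions_sketch.py",
    "content": "\"\"\"Hedged sketch (not part of the library): populating a SessionManager\nfrom scrapling/spiders/session.py with several sessions, mirroring what\nSpider.configure_sessions() receives. The session ids ('http', 'browser')\nand this file path are invented for illustration; SessionManager.add()\nwith its default/lazy flags and the session classes come from the\nmodules above.\n\"\"\"\nfrom scrapling.spiders.session import SessionManager\nfrom scrapling.fetchers import FetcherSession, AsyncDynamicSession\n\nmanager = SessionManager()\n\n# add() returns the manager, so registrations can be chained.\n# The first session added becomes the default unless default=True is\n# passed for a later one; lazy=True delays startup until a request\n# actually references that session id.\nmanager.add(\"http\", FetcherSession(), default=True).add(\n    \"browser\", AsyncDynamicSession(), lazy=True\n)\n\nassert \"http\" in manager  # __contains__\nassert len(manager) == 2  # __len__\nassert manager.default_session_id == \"http\"\n"
  },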
  {
    "path": "scrapling/spiders/spider.py",
    "content": "import signal\nimport logging\nfrom pathlib import Path\nfrom abc import ABC, abstractmethod\n\nimport anyio\nfrom anyio import Path as AsyncPath\n\nfrom scrapling.spiders.request import Request\nfrom scrapling.spiders.engine import CrawlerEngine\nfrom scrapling.spiders.session import SessionManager\nfrom scrapling.core.utils import set_logger, reset_logger\nfrom scrapling.spiders.result import CrawlResult, CrawlStats\nfrom scrapling.core._types import Set, Any, Dict, Optional, Union, TYPE_CHECKING, AsyncGenerator\n\nBLOCKED_CODES = {401, 403, 407, 429, 444, 500, 502, 503, 504}\nif TYPE_CHECKING:\n    from scrapling.engines.toolbelt.custom import Response\n\n\nclass LogCounterHandler(logging.Handler):\n    \"\"\"A logging handler that counts log messages by level.\"\"\"\n\n    def __init__(self):\n        super().__init__()\n        self.counts = {\n            logging.DEBUG: 0,\n            logging.INFO: 0,\n            logging.WARNING: 0,\n            logging.ERROR: 0,\n            logging.CRITICAL: 0,\n        }\n\n    def emit(self, record: logging.LogRecord) -> None:\n        level = record.levelno\n        # Map to the closest standard level\n        if level >= logging.CRITICAL:\n            self.counts[logging.CRITICAL] += 1\n        elif level >= logging.ERROR:\n            self.counts[logging.ERROR] += 1\n        elif level >= logging.WARNING:\n            self.counts[logging.WARNING] += 1\n        elif level >= logging.INFO:\n            self.counts[logging.INFO] += 1\n        else:\n            self.counts[logging.DEBUG] += 1\n\n    def get_counts(self) -> Dict[str, int]:\n        \"\"\"Return counts as a dictionary with string keys.\"\"\"\n        return {\n            \"debug\": self.counts[logging.DEBUG],\n            \"info\": self.counts[logging.INFO],\n            \"warning\": self.counts[logging.WARNING],\n            \"error\": self.counts[logging.ERROR],\n            \"critical\": self.counts[logging.CRITICAL],\n        }\n\n\nclass SessionConfigurationError(Exception):\n    \"\"\"Raised when session configuration fails.\"\"\"\n\n    pass\n\n\nclass Spider(ABC):\n    \"\"\"An abstract base class for creating web spiders.\n\n    Check the documentation website for more information.\n    \"\"\"\n\n    name: Optional[str] = None\n    start_urls: list[str] = []\n    allowed_domains: Set[str] = set()\n\n    # Concurrency settings\n    concurrent_requests: int = 4\n    concurrent_requests_per_domain: int = 0\n    download_delay: float = 0.0\n    max_blocked_retries: int = 3\n\n    # Fingerprint adjustments\n    fp_include_kwargs: bool = False\n    fp_keep_fragments: bool = False\n    fp_include_headers: bool = False\n\n    # Logging settings\n    logging_level: int = logging.DEBUG\n    logging_format: str = \"[%(asctime)s]:({spider_name}) %(levelname)s: %(message)s\"\n    logging_date_format: str = \"%Y-%m-%d %H:%M:%S\"\n    log_file: Optional[str] = None\n\n    def __init__(self, crawldir: Optional[Union[str, Path, AsyncPath]] = None, interval: float = 300.0):\n        \"\"\"Initialize the spider.\n\n        :param crawldir: Directory for checkpoint files. 
If provided, enables pause/resume.\n        :param interval: Seconds between periodic checkpoint saves (default 5 minutes).\n        \"\"\"\n        if self.name is None:\n            raise ValueError(f\"{self.__class__.__name__} must have a name.\")\n\n        self.logger = logging.getLogger(f\"scrapling.spiders.{self.name}\")\n        self.logger.setLevel(self.logging_level)\n        self.logger.handlers.clear()\n        self.logger.propagate = False  # Don't propagate to parent 'scrapling' logger\n\n        formatter = logging.Formatter(\n            fmt=self.logging_format.format(spider_name=self.name), datefmt=self.logging_date_format\n        )\n\n        # Add a log counter handler to track log counts by level\n        self._log_counter = LogCounterHandler()\n        self.logger.addHandler(self._log_counter)\n\n        console_handler = logging.StreamHandler()\n        console_handler.setFormatter(formatter)\n        self.logger.addHandler(console_handler)\n\n        if self.log_file:\n            Path(self.log_file).parent.mkdir(parents=True, exist_ok=True)\n            file_handler = logging.FileHandler(self.log_file)\n            file_handler.setFormatter(formatter)\n            self.logger.addHandler(file_handler)\n\n        self.crawldir: Optional[Path] = Path(crawldir) if crawldir else None\n        self._interval = interval\n        self._engine: Optional[CrawlerEngine] = None\n        self._original_sigint_handler: Any = None\n\n        self._session_manager = SessionManager()\n\n        try:\n            self.configure_sessions(self._session_manager)\n        except Exception as e:\n            raise SessionConfigurationError(f\"Error in {self.__class__.__name__}.configure_sessions(): {e}\") from e\n\n        if len(self._session_manager) == 0:\n            raise SessionConfigurationError(f\"{self.__class__.__name__}.configure_sessions() did not add any sessions\")\n\n        self.logger.info(\"Spider initialized\")\n\n    async def start_requests(self) -> AsyncGenerator[Request, None]:\n        \"\"\"Generate initial requests to start the crawl.\n\n        By default, this generates Request objects for each URL in `start_urls`\n        using the session manager's default session and `parse()` as callback.\n\n        Override this method for more control over initial requests\n        (e.g., to add custom headers, use different callbacks, etc.)\n        \"\"\"\n        if not self.start_urls:\n            raise RuntimeError(\n                \"Spider has no starting point, either set `start_urls` or override `start_requests` function.\"\n            )\n\n        for url in self.start_urls:\n            yield Request(url, sid=self._session_manager.default_session_id)\n\n    @abstractmethod\n    async def parse(self, response: \"Response\") -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n        \"\"\"Default callback for processing responses\"\"\"\n        raise NotImplementedError(f\"{self.__class__.__name__} must implement parse() method\")\n        yield  # Make this a generator for type checkers\n\n    async def on_start(self, resuming: bool = False) -> None:\n        \"\"\"Called before crawling starts. 
Override for setup logic.\n\n        :param resuming: It's enabled if the spider is resuming from a checkpoint, left for the user to use.\n        \"\"\"\n        if resuming:\n            self.logger.debug(\"Resuming spider from checkpoint\")\n        else:\n            self.logger.debug(\"Starting spider\")\n\n    async def on_close(self) -> None:\n        \"\"\"Called after crawling finishes. Override for cleanup logic.\"\"\"\n        self.logger.debug(\"Spider closed\")\n\n    async def on_error(self, request: Request, error: Exception) -> None:\n        \"\"\"\n        Handle request errors for all spider requests.\n\n        Override for custom error handling.\n        \"\"\"\n        pass\n\n    async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None:\n        \"\"\"A hook to be overridden by users to do some processing on scraped items, return `None` to drop the item silently.\"\"\"\n        return item\n\n    async def is_blocked(self, response: \"Response\") -> bool:\n        \"\"\"Check if the response is blocked. Users should override this for custom detection logic.\"\"\"\n        if response.status in BLOCKED_CODES:\n            return True\n        return False\n\n    async def retry_blocked_request(self, request: Request, response: \"Response\") -> Request:\n        \"\"\"Users should override this to prepare the blocked request before retrying, if needed.\"\"\"\n        return request\n\n    def __repr__(self) -> str:\n        \"\"\"String representation of the spider.\"\"\"\n        return f\"<{self.__class__.__name__} '{self.name}'>\"\n\n    def configure_sessions(self, manager: SessionManager) -> None:\n        \"\"\"Configure sessions for this spider.\n\n        Override this method to add custom sessions.\n        The default implementation creates a FetcherSession session.\n\n        The first session added becomes the default for `start_requests()` unless specified otherwise.\n\n        :param manager: SessionManager to configure\n        \"\"\"\n        from scrapling.fetchers import FetcherSession\n\n        manager.add(\"default\", FetcherSession())\n\n    def pause(self):\n        \"\"\"Request graceful shutdown of the crawling process.\"\"\"\n        if self._engine:\n            self._engine.request_pause()\n        else:\n            raise RuntimeError(\"No active crawl to stop\")\n\n    def _setup_signal_handler(self) -> None:\n        \"\"\"Set up SIGINT handler for graceful pause.\"\"\"\n\n        def handler(_signum: int, _frame: Any) -> None:\n            if self._engine:\n                self._engine.request_pause()\n            else:\n                # No engine yet, just raise KeyboardInterrupt\n                raise KeyboardInterrupt\n\n        try:\n            self._original_sigint_handler = signal.signal(signal.SIGINT, handler)\n        except ValueError:\n            self._original_sigint_handler = None\n\n    def _restore_signal_handler(self) -> None:\n        \"\"\"Restore original SIGINT handler.\"\"\"\n        if self._original_sigint_handler is not None:\n            try:\n                signal.signal(signal.SIGINT, self._original_sigint_handler)\n            except ValueError:\n                pass\n\n    async def __run(self) -> CrawlResult:\n        token = set_logger(self.logger)\n        try:\n            self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval)\n            stats = await self._engine.crawl()\n            paused = self._engine.paused\n            return 
CrawlResult(stats=stats, items=self._engine.items, paused=paused)\n        finally:\n            self._engine = None\n            reset_logger(token)\n            # Close any file handlers to release file resources.\n            if self.log_file:\n                for handler in self.logger.handlers:\n                    if isinstance(handler, logging.FileHandler):\n                        handler.close()\n\n    def start(self, use_uvloop: bool = False, **backend_options: Any) -> CrawlResult:\n        \"\"\"Run the spider and return results.\n\n        This is the main entry point for running a spider.\n        Handles async execution internally via anyio.\n\n        Pressing Ctrl+C will initiate graceful shutdown (waits for active tasks to complete).\n        Pressing Ctrl+C a second time will force immediate stop.\n\n        If crawldir is set, a checkpoint will also be saved on graceful shutdown,\n        allowing you to resume the crawl later by running the spider again.\n\n        :param use_uvloop: Whether to use the faster uvloop/winloop event loop implementation, if available.\n        :param backend_options: Asyncio backend options to be used with `anyio.run`\n        \"\"\"\n        backend_options = backend_options or {}\n        if use_uvloop:\n            backend_options.update({\"use_uvloop\": True})\n\n        # Set up SIGINT handler for graceful shutdown\n        self._setup_signal_handler()\n        try:\n            return anyio.run(self.__run, backend=\"asyncio\", backend_options=backend_options)\n        finally:\n            self._restore_signal_handler()\n\n    async def stream(self) -> AsyncGenerator[Dict[str, Any], None]:\n        \"\"\"Stream items as they're scraped. Ideal for long-running spiders or building applications on top of the spiders.\n\n        Must be called from an async context. Yields items one by one as they are scraped.\n        Access `spider.stats` during iteration for real-time statistics.\n\n        Note: SIGINT handling for pause/resume is not available in stream mode.\n        \"\"\"\n        token = set_logger(self.logger)\n        try:\n            self._engine = CrawlerEngine(self, self._session_manager, self.crawldir, self._interval)\n            async for item in self._engine:\n                yield item\n        finally:\n            self._engine = None\n            reset_logger(token)\n            if self.log_file:\n                for handler in self.logger.handlers:\n                    if isinstance(handler, logging.FileHandler):\n                        handler.close()\n\n    @property\n    def stats(self) -> CrawlStats:\n        \"\"\"Access current crawl stats (works during streaming).\"\"\"\n        if self._engine:\n            return self._engine.stats\n        raise RuntimeError(\"No active crawl. Use this property inside `async for item in spider.stream():`\")\n"
  },
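  {
    "path": "examples/minimal_spider_sketch.py",
    "content": "\"\"\"Hedged sketch (not part of the library): a minimal Spider subclass\nwired against the API defined in scrapling/spiders/spider.py. The spider\nname, URLs, and yielded item fields are invented for illustration;\nSpider, Request, CrawlResult, and start() come from the modules above.\nThe default configure_sessions() already registers a plain\nFetcherSession, so no session setup is needed here.\n\"\"\"\nfrom scrapling.spiders.spider import Spider\nfrom scrapling.spiders.request import Request\n\n\nclass ExampleSpider(Spider):\n    name = \"example\"\n    start_urls = [\"https://example.com/\"]\n    concurrent_requests = 2\n\n    async def parse(self, response):\n        # Yield scraped items as plain dicts...\n        yield {\"url\": response.url, \"status\": response.status}\n        # ...and follow-up requests (routed to the default session when\n        # no sid is given; duplicate URLs are dropped by the scheduler\n        # unless dont_filter=True is set).\n        yield Request(\"https://example.com/about\")\n\n\nif __name__ == \"__main__\":\n    result = ExampleSpider().start()\n    print(f\"{len(result)} items scraped, completed={result.completed}\")\n"
  },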
  {
    "path": "server.json",
    "content": "{\n  \"$schema\": \"https://static.modelcontextprotocol.io/schemas/2025-12-11/server.schema.json\",\n  \"name\": \"io.github.D4Vinci/Scrapling\",\n  \"title\": \"Scrapling MCP Server\",\n  \"description\": \"Web scraping with stealth HTTP, real browsers, and Cloudflare bypass. CSS selectors supported.\",\n  \"websiteUrl\": \"https://scrapling.readthedocs.io/en/latest/ai/mcp-server.html\",\n  \"repository\": {\n    \"url\": \"https://github.com/D4Vinci/Scrapling\",\n    \"source\": \"github\"\n  },\n  \"icons\": [\n    {\n      \"src\": \"https://raw.githubusercontent.com/D4Vinci/Scrapling/main/docs/assets/logo.png\",\n      \"mimeType\": \"image/png\"\n    }\n  ],\n  \"version\": \"0.4.2\",\n  \"packages\": [\n    {\n      \"registryType\": \"pypi\",\n      \"identifier\": \"scrapling\",\n      \"version\": \"0.4.2\",\n      \"runtimeHint\": \"uvx\",\n      \"packageArguments\": [\n        {\n          \"type\": \"positional\",\n          \"valueHint\": \"mcp\",\n          \"isFixed\": true\n        }\n      ],\n      \"transport\": {\n        \"type\": \"stdio\"\n      }\n    },\n    {\n      \"registryType\": \"oci\",\n      \"identifier\": \"ghcr.io/d4vinci/scrapling\",\n      \"packageArguments\": [\n        {\n          \"type\": \"positional\",\n          \"valueHint\": \"mcp\",\n          \"isFixed\": true\n        }\n      ],\n      \"transport\": {\n        \"type\": \"stdio\"\n      }\n    }\n  ]\n}"
  },
  {
    "path": "setup.cfg",
    "content": "[metadata]\nname = scrapling\nversion = 0.4.2\nauthor = Karim Shoair\nauthor_email = karim.shoair@pm.me\ndescription = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!\nlicense = BSD\nhome_page = https://github.com/D4Vinci/Scrapling"
  },
  {
    "path": "tests/__init__.py",
    "content": "\"\"\"Package for test project.\"\"\"\n"
  },
  {
    "path": "tests/ai/__init__.py",
    "content": ""
  },
  {
    "path": "tests/ai/test_ai_mcp.py",
    "content": "import pytest\nimport pytest_httpbin\n\nfrom scrapling.core.ai import ScraplingMCPServer, ResponseModel\n\n\n@pytest_httpbin.use_class_based_httpbin\nclass TestMCPServer:\n    \"\"\"Test MCP server functionality\"\"\"\n\n    @pytest.fixture(scope=\"class\")\n    def test_url(self, httpbin):\n        return f\"{httpbin.url}/html\"\n\n    @pytest.fixture\n    def server(self):\n        return ScraplingMCPServer()\n\n    def test_get_tool(self, server, test_url):\n        \"\"\"Test the get tool method\"\"\"\n        result = server.get(url=test_url, extraction_type=\"markdown\")\n        assert isinstance(result, ResponseModel)\n        assert result.status == 200\n        assert result.url == test_url\n\n    @pytest.mark.asyncio\n    async def test_bulk_get_tool(self, server, test_url):\n        \"\"\"Test the bulk_get tool method\"\"\"\n        results = await server.bulk_get(urls=(test_url, test_url), extraction_type=\"html\")\n\n        assert len(results) == 2\n        assert all(isinstance(r, ResponseModel) for r in results)\n\n    @pytest.mark.asyncio\n    async def test_fetch_tool(self, server, test_url):\n        \"\"\"Test the fetch tool method\"\"\"\n        result = await server.fetch(url=test_url, headless=True)\n        assert isinstance(result, ResponseModel)\n        assert result.status == 200\n\n    @pytest.mark.asyncio\n    async def test_bulk_fetch_tool(self, server, test_url):\n        \"\"\"Test the bulk_fetch tool method\"\"\"\n        result = await server.bulk_fetch(urls=(test_url, test_url), headless=True)\n        assert all(isinstance(r, ResponseModel) for r in result)\n\n    @pytest.mark.asyncio\n    async def test_stealthy_fetch_tool(self, server, test_url):\n        \"\"\"Test the stealthy_fetch tool method\"\"\"\n        result = await server.stealthy_fetch(url=test_url, headless=True)\n        assert isinstance(result, ResponseModel)\n        assert result.status == 200\n\n    @pytest.mark.asyncio\n    async def test_bulk_stealthy_fetch_tool(self, server, test_url):\n        \"\"\"Test the bulk_stealthy_fetch tool method\"\"\"\n        result = await server.bulk_stealthy_fetch(urls=(test_url, test_url), headless=True)\n        assert all(isinstance(r, ResponseModel) for r in result)\n"
  },
  {
    "path": "tests/cli/__init__.py",
    "content": ""
  },
  {
    "path": "tests/cli/test_cli.py",
    "content": "import pytest\nfrom click.testing import CliRunner\nfrom unittest.mock import patch, MagicMock\nimport pytest_httpbin\n\nfrom scrapling.parser import Selector\nfrom scrapling.cli import (\n    shell, mcp, get, post, put, delete, fetch, stealthy_fetch\n)\n\n\n@pytest_httpbin.use_class_based_httpbin\ndef configure_selector_mock():\n    \"\"\"Helper function to create a properly configured Selector mock\"\"\"\n    mock_response = MagicMock(spec=Selector)\n    mock_response.body = \"<html><body>Test content</body></html>\"\n    mock_response.html_content = \"<html><body>Test content</body></html>\"\n    mock_response.encoding = \"utf-8\"\n    mock_response.get_all_text.return_value = \"Test content\"\n    mock_response.css.return_value = [mock_response]\n    return mock_response\n\n\nclass TestCLI:\n    \"\"\"Test CLI functionality\"\"\"\n\n    @pytest.fixture\n    def html_url(self, httpbin):\n        return f\"{httpbin.url}/html\"\n\n    @pytest.fixture\n    def runner(self):\n        return CliRunner()\n\n    def test_shell_command(self, runner):\n        \"\"\"Test shell command\"\"\"\n        with patch('scrapling.core.shell.CustomShell') as mock_shell:\n            mock_instance = MagicMock()\n            mock_shell.return_value = mock_instance\n\n            result = runner.invoke(shell)\n            assert result.exit_code == 0\n            mock_instance.start.assert_called_once()\n\n    def test_mcp_command(self, runner):\n        \"\"\"Test MCP command\"\"\"\n        with patch('scrapling.core.ai.ScraplingMCPServer') as mock_server:\n            mock_instance = MagicMock()\n            mock_server.return_value = mock_instance\n\n            result = runner.invoke(mcp)\n            assert result.exit_code == 0\n            mock_instance.serve.assert_called_once()\n\n    def test_extract_get_command(self, runner, tmp_path, html_url):\n        \"\"\"Test extract `get` command\"\"\"\n        output_file = tmp_path / \"output.md\"\n\n        with patch('scrapling.fetchers.Fetcher.get') as mock_get:\n            mock_response = configure_selector_mock()\n            mock_response.status = 200\n            mock_get.return_value = mock_response\n\n            result = runner.invoke(\n                get,\n                [html_url, str(output_file)]\n            )\n            assert result.exit_code == 0\n\n        # Test with various options\n        with patch('scrapling.fetchers.Fetcher.get') as mock_get:\n            mock_get.return_value = mock_response\n\n            result = runner.invoke(\n                get,\n                [\n                    html_url,\n                    str(output_file),\n                    '-H', 'User-Agent: Test',\n                    '--cookies', 'session=abc123',\n                    '--timeout', '60',\n                    '--proxy', 'http://proxy:8080',\n                    '-s', '.content',\n                    '-p', 'page=1'\n                ]\n            )\n            assert result.exit_code == 0\n\n    def test_extract_post_command(self, runner, tmp_path, html_url):\n        \"\"\"Test extract `post` command\"\"\"\n        output_file = tmp_path / \"output.html\"\n\n        with patch('scrapling.fetchers.Fetcher.post') as mock_post:\n            mock_response = configure_selector_mock()\n            mock_post.return_value = mock_response\n\n            result = runner.invoke(\n                post,\n                [\n                    html_url,\n                    str(output_file),\n                    '-d', 'key=value',\n   
                 '-j', '{\"data\": \"test\"}'\n                ]\n            )\n            assert result.exit_code == 0\n\n    def test_extract_put_command(self, runner, tmp_path, html_url):\n        \"\"\"Test extract `put` command\"\"\"\n        output_file = tmp_path / \"output.html\"\n\n        with patch('scrapling.fetchers.Fetcher.put') as mock_put:\n            mock_response = configure_selector_mock()\n            mock_put.return_value = mock_response\n\n            result = runner.invoke(\n                put,\n                [\n                    html_url,\n                    str(output_file),\n                    '-d', 'key=value',\n                    '-j', '{\"data\": \"test\"}'\n                ]\n            )\n            assert result.exit_code == 0\n\n    def test_extract_delete_command(self, runner, tmp_path, html_url):\n        \"\"\"Test extract `delete` command\"\"\"\n        output_file = tmp_path / \"output.html\"\n\n        with patch('scrapling.fetchers.Fetcher.delete') as mock_delete:\n            mock_response = configure_selector_mock()\n            mock_delete.return_value = mock_response\n\n            result = runner.invoke(\n                delete,\n                [\n                    html_url,\n                    str(output_file)\n                ]\n            )\n            assert result.exit_code == 0\n\n    def test_extract_fetch_command(self, runner, tmp_path, html_url):\n        \"\"\"Test extract fetch command\"\"\"\n        output_file = tmp_path / \"output.txt\"\n\n        with patch('scrapling.fetchers.DynamicFetcher.fetch') as mock_fetch:\n            mock_response = configure_selector_mock()\n            mock_fetch.return_value = mock_response\n\n            result = runner.invoke(\n                fetch,\n                [\n                    html_url,\n                    str(output_file),\n                    '--headless',\n                    '--timeout', '60000'\n                ]\n            )\n            assert result.exit_code == 0\n\n    def test_extract_stealthy_fetch_command(self, runner, tmp_path, html_url):\n        \"\"\"Test extract fetch command\"\"\"\n        output_file = tmp_path / \"output.md\"\n\n        with patch('scrapling.fetchers.StealthyFetcher.fetch') as mock_fetch:\n            mock_response = configure_selector_mock()\n            mock_fetch.return_value = mock_response\n\n            result = runner.invoke(\n                stealthy_fetch,\n                [\n                    html_url,\n                    str(output_file),\n                    '--headless',\n                    '--css-selector', 'body',\n                    '--timeout', '60000'\n                ]\n            )\n            assert result.exit_code == 0\n\n    def test_invalid_arguments(self, runner, html_url):\n        \"\"\"Test invalid arguments handling\"\"\"\n        # Missing required arguments\n        result = runner.invoke(get)\n        assert result.exit_code != 0\n\n        _ = runner.invoke(\n            get,\n            [html_url, 'output.invalid']\n        )\n        # Should handle the error gracefully\n\n    def test_impersonate_comma_separated(self, runner, tmp_path, html_url):\n        \"\"\"Test that comma-separated impersonate values are parsed correctly\"\"\"\n        output_file = tmp_path / \"output.md\"\n\n        with patch('scrapling.fetchers.Fetcher.get') as mock_get:\n            mock_response = configure_selector_mock()\n            mock_response.status = 200\n            mock_get.return_value = 
mock_response\n\n            result = runner.invoke(\n                get,\n                [\n                    html_url,\n                    str(output_file),\n                    '--impersonate', 'chrome,firefox,safari'\n                ]\n            )\n            assert result.exit_code == 0\n\n            # Verify that the impersonate argument was converted to a list\n            call_kwargs = mock_get.call_args[1]\n            assert isinstance(call_kwargs['impersonate'], list)\n            assert call_kwargs['impersonate'] == ['chrome', 'firefox', 'safari']\n\n    def test_impersonate_single_browser(self, runner, tmp_path, html_url):\n        \"\"\"Test that single impersonate value remains as string\"\"\"\n        output_file = tmp_path / \"output.md\"\n\n        with patch('scrapling.fetchers.Fetcher.get') as mock_get:\n            mock_response = configure_selector_mock()\n            mock_response.status = 200\n            mock_get.return_value = mock_response\n\n            result = runner.invoke(\n                get,\n                [\n                    html_url,\n                    str(output_file),\n                    '--impersonate', 'chrome'\n                ]\n            )\n            assert result.exit_code == 0\n\n            # Verify that the impersonate argument remains a string\n            call_kwargs = mock_get.call_args[1]\n            assert isinstance(call_kwargs['impersonate'], str)\n            assert call_kwargs['impersonate'] == 'chrome'\n"
  },
  {
    "path": "tests/cli/test_shell_functionality.py",
    "content": "import pytest\nfrom unittest.mock import patch, MagicMock\n\nfrom scrapling.parser import Selector\nfrom scrapling.core.shell import CustomShell, CurlParser, Convertor\n\n\nclass TestCurlParser:\n    \"\"\"Test curl command parsing\"\"\"\n\n    @pytest.fixture\n    def parser(self):\n        return CurlParser()\n\n    def test_basic_curl_parse(self, parser):\n        \"\"\"Test parsing basic curl commands\"\"\"\n        # Simple GET\n        curl_cmd = 'curl https://example.com'\n        request = parser.parse(curl_cmd)\n\n        assert request.url == 'https://example.com'\n        assert request.method == 'get'\n        assert request.data is None\n\n    def test_curl_with_headers(self, parser):\n        \"\"\"Test parsing curl with headers\"\"\"\n        curl_cmd = '''curl https://example.com \\\n            -H \"User-Agent: Mozilla/5.0\" \\\n            -H \"Accept: application/json\"'''\n\n        request = parser.parse(curl_cmd)\n\n        assert request.headers['User-Agent'] == 'Mozilla/5.0'\n        assert request.headers['Accept'] == 'application/json'\n\n    def test_curl_with_data(self, parser):\n        \"\"\"Test parsing curl with data\"\"\"\n        # Form data\n        curl_cmd = 'curl https://example.com -X POST -d \"key=value&foo=bar\"'\n        request = parser.parse(curl_cmd)\n\n        assert request.method == 'post'\n        assert request.data == 'key=value&foo=bar'\n\n        # JSON data\n        curl_cmd = \"\"\"curl https://example.com -X POST --data-raw '{\"key\": \"value\"}'\"\"\"\n        request = parser.parse(curl_cmd)\n\n        assert request.json_data == {\"key\": \"value\"}\n\n    def test_curl_with_cookies(self, parser):\n        \"\"\"Test parsing curl with cookies\"\"\"\n        curl_cmd = '''curl https://example.com \\\n            -H \"Cookie: session=abc123; user=john\" \\\n            -b \"extra=cookie\"'''\n\n        request = parser.parse(curl_cmd)\n\n        assert request.cookies['session'] == 'abc123'\n        assert request.cookies['user'] == 'john'\n        assert request.cookies['extra'] == 'cookie'\n\n    def test_curl_with_proxy(self, parser):\n        \"\"\"Test parsing curl with proxy\"\"\"\n        curl_cmd = 'curl https://example.com -x http://proxy:8080 -U user:pass'\n        request = parser.parse(curl_cmd)\n\n        assert 'http://user:pass@proxy:8080' in request.proxy['http']\n\n    def test_curl2fetcher(self, parser):\n        \"\"\"Test converting curl to fetcher request\"\"\"\n        with patch('scrapling.fetchers.Fetcher.get') as mock_get:\n            mock_response = MagicMock()\n            mock_get.return_value = mock_response\n\n            curl_cmd = 'curl https://example.com'\n            _ = parser.convert2fetcher(curl_cmd)\n\n            mock_get.assert_called_once()\n\n    def test_invalid_curl_commands(self, parser):\n        \"\"\"Test handling invalid curl commands\"\"\"\n        # Invalid format\n        with pytest.raises(AttributeError):\n            parser.parse('not a curl command')\n\n\nclass TestConvertor:\n    \"\"\"Test content conversion functionality\"\"\"\n\n    @pytest.fixture\n    def sample_html(self):\n        return \"\"\"\n        <html>\n            <body>\n                <div class=\"content\">\n                    <h1>Title</h1>\n                    <p>Some text content</p>\n                </div>\n            </body>\n        </html>\n        \"\"\"\n\n    def test_extract_markdown(self, sample_html):\n        \"\"\"Test extracting content as Markdown\"\"\"\n        page = 
Selector(sample_html)\n        content = list(Convertor._extract_content(page, \"markdown\"))\n\n        assert len(content) > 0\n        assert \"Title\\n=====\" in content[0]  # Markdown conversion\n\n    def test_extract_html(self, sample_html):\n        \"\"\"Test extracting content as HTML\"\"\"\n        page = Selector(sample_html)\n        content = list(Convertor._extract_content(page, \"html\"))\n\n        assert len(content) > 0\n        assert \"<h1>Title</h1>\" in content[0]\n\n    def test_extract_text(self, sample_html):\n        \"\"\"Test extracting content as plain text\"\"\"\n        page = Selector(sample_html)\n        content = list(Convertor._extract_content(page, \"text\"))\n\n        assert len(content) > 0\n        assert \"Title\" in content[0]\n        assert \"Some text content\" in content[0]\n\n    def test_extract_with_selector(self, sample_html):\n        \"\"\"Test extracting with CSS selector\"\"\"\n        page = Selector(sample_html)\n        content = list(Convertor._extract_content(\n            page,\n            \"text\",\n            css_selector=\".content\"\n        ))\n\n        assert len(content) > 0\n\n    def test_write_to_file(self, sample_html, tmp_path):\n        \"\"\"Test writing content to files\"\"\"\n        page = Selector(sample_html)\n\n        # Test markdown\n        md_file = tmp_path / \"output.md\"\n        Convertor.write_content_to_file(page, str(md_file))\n        assert md_file.exists()\n\n        # Test HTML\n        html_file = tmp_path / \"output.html\"\n        Convertor.write_content_to_file(page, str(html_file))\n        assert html_file.exists()\n\n        # Test text\n        txt_file = tmp_path / \"output.txt\"\n        Convertor.write_content_to_file(page, str(txt_file))\n        assert txt_file.exists()\n\n    def test_invalid_operations(self, sample_html):\n        \"\"\"Test error handling in convertor\"\"\"\n        page = Selector(sample_html)\n\n        # Invalid extraction type\n        with pytest.raises(ValueError):\n            list(Convertor._extract_content(page, \"invalid\"))\n\n        # Invalid filename\n        with pytest.raises(ValueError):\n            Convertor.write_content_to_file(page, \"\")\n\n        # Unknown file extension\n        with pytest.raises(ValueError):\n            Convertor.write_content_to_file(page, \"output.xyz\")\n\n\nclass TestCustomShell:\n    \"\"\"Test interactive shell functionality\"\"\"\n\n    def test_shell_initialization(self):\n        \"\"\"Test shell initialization\"\"\"\n        shell = CustomShell(code=\"\", log_level=\"debug\")\n\n        assert shell.log_level == 10  # DEBUG level\n        assert shell.page is None\n        assert len(shell.pages) == 0\n\n    def test_shell_namespace(self):\n        \"\"\"Test shell namespace creation\"\"\"\n        shell = CustomShell(code=\"\")\n        namespace = shell.get_namespace()\n\n        # Check all expected functions/classes are available\n        assert 'get' in namespace\n        assert 'post' in namespace\n        assert 'Fetcher' in namespace\n        assert 'DynamicFetcher' in namespace\n        assert 'view' in namespace\n        assert 'uncurl' in namespace\n"
  },
  {
    "path": "tests/core/__init__.py",
    "content": ""
  },
  {
    "path": "tests/core/test_shell_core.py",
    "content": "import pytest\n\nfrom scrapling.core.shell import (\n    _CookieParser,\n    _ParseHeaders,\n    Request,\n    _known_logging_levels,\n)\n\n\nclass TestCookieParser:\n    \"\"\"Test cookie parsing functionality\"\"\"\n    \n    def test_simple_cookie_parsing(self):\n        \"\"\"Test parsing a simple cookie\"\"\"\n        cookie_string = \"session_id=abc123\"\n        cookies = list(_CookieParser(cookie_string))\n        assert len(cookies) == 1\n        assert cookies[0] == (\"session_id\", \"abc123\")\n    \n    def test_multiple_cookies_parsing(self):\n        \"\"\"Test parsing multiple cookies\"\"\"\n        cookie_string = \"session_id=abc123; theme=dark; lang=en\"\n        cookies = list(_CookieParser(cookie_string))\n        assert len(cookies) == 3\n        cookie_dict = dict(cookies)\n        assert cookie_dict[\"session_id\"] == \"abc123\"\n        assert cookie_dict[\"theme\"] == \"dark\"\n        assert cookie_dict[\"lang\"] == \"en\"\n    \n    def test_cookie_with_attributes(self):\n        \"\"\"Test parsing cookies with attributes\"\"\"\n        cookie_string = \"session_id=abc123; Path=/; HttpOnly; Secure\"\n        cookies = list(_CookieParser(cookie_string))\n        assert len(cookies) == 1\n        assert cookies[0] == (\"session_id\", \"abc123\")\n    \n    def test_empty_cookie_string(self):\n        \"\"\"Test parsing empty cookie string\"\"\"\n        cookies = list(_CookieParser(\"\"))\n        assert len(cookies) == 0\n    \n    def test_malformed_cookie_handling(self):\n        \"\"\"Test handling of malformed cookies\"\"\"\n        # Should not raise exception but may return an empty list\n        cookies = list(_CookieParser(\"invalid_cookie_format\"))\n        assert isinstance(cookies, list)\n\n\nclass TestParseHeaders:\n    \"\"\"Test header parsing functionality\"\"\"\n    \n    def test_simple_headers(self):\n        \"\"\"Test parsing simple headers\"\"\"\n        header_lines = [\n            \"Content-Type: text/html\",\n            \"Content-Length: 1234\",\n            \"User-Agent: TestAgent/1.0\"\n        ]\n        headers, cookies = _ParseHeaders(header_lines)\n        \n        assert headers[\"Content-Type\"] == \"text/html\"\n        assert headers[\"Content-Length\"] == \"1234\"\n        assert headers[\"User-Agent\"] == \"TestAgent/1.0\"\n        assert len(cookies) == 0\n    \n    def test_headers_with_cookies(self):\n        \"\"\"Test parsing headers with cookie headers\"\"\"\n        header_lines = [\n            \"Content-Type: text/html\",\n            \"Set-Cookie: session_id=abc123\",\n            \"Set-Cookie: theme=dark; Path=/\",\n        ]\n        headers, cookies = _ParseHeaders(header_lines)\n        \n        assert headers[\"Content-Type\"] == \"text/html\"\n        assert \"Set-Cookie\" in headers  # Should contain the first Set-Cookie\n        # Cookie parsing behavior depends on implementation\n    \n    def test_headers_without_colons(self):\n        \"\"\"Test headers without colons\"\"\"\n        header_lines = [\n            \"Content-Type: text/html\",\n            \"InvalidHeader;\",  # Header ending with semicolon\n        ]\n        headers, cookies = _ParseHeaders(header_lines)\n        \n        assert headers[\"Content-Type\"] == \"text/html\"\n        assert \"InvalidHeader\" in headers\n        assert headers[\"InvalidHeader\"] == \"\"\n    \n    def test_invalid_header_format(self):\n        \"\"\"Test invalid header format raises error\"\"\"\n        header_lines = [\n            
\"Content-Type: text/html\",\n            \"InvalidHeaderWithoutColon\",  # No colon, no semicolon\n        ]\n        \n        with pytest.raises(ValueError, match=\"Could not parse header without colon\"):\n            _ParseHeaders(header_lines)\n    \n    def test_headers_with_multiple_colons(self):\n        \"\"\"Test headers with multiple colons\"\"\"\n        header_lines = [\n            \"Authorization: Bearer: token123\",\n            \"X-Custom: value:with:colons\",\n        ]\n        headers, cookies = _ParseHeaders(header_lines)\n        \n        assert headers[\"Authorization\"] == \"Bearer: token123\"\n        assert headers[\"X-Custom\"] == \"value:with:colons\"\n    \n    def test_headers_with_whitespace(self):\n        \"\"\"Test headers with extra whitespace\"\"\"\n        header_lines = [\n            \"  Content-Type  :  text/html  \",\n            \"\\tUser-Agent\\t:\\tTestAgent/1.0\\t\",\n        ]\n        headers, cookies = _ParseHeaders(header_lines)\n        \n        # Should handle whitespace correctly\n        assert \"Content-Type\" in headers or \"  Content-Type  \" in headers\n        assert \"text/html\" in str(headers.values()) or \"  text/html  \" in str(headers.values())\n    \n    def test_parse_cookies_disabled(self):\n        \"\"\"Test parsing with cookies disabled\"\"\"\n        header_lines = [\n            \"Content-Type: text/html\",\n            \"Set-Cookie: session_id=abc123\",\n        ]\n        headers, cookies = _ParseHeaders(header_lines, parse_cookies=False)\n        \n        assert headers[\"Content-Type\"] == \"text/html\"\n        # Cookie parsing behavior when disabled\n        assert len(cookies) == 0 or \"Set-Cookie\" in headers\n    \n    def test_empty_header_lines(self):\n        \"\"\"Test parsing empty header lines\"\"\"\n        headers, cookies = _ParseHeaders([])\n        assert len(headers) == 0\n        assert len(cookies) == 0\n\n\nclass TestRequestNamedTuple:\n    \"\"\"Test Request namedtuple functionality\"\"\"\n    \n    def test_request_creation(self):\n        \"\"\"Test creating Request namedtuple\"\"\"\n        request = Request(\n            method=\"GET\",\n            url=\"https://example.com\",\n            params={\"q\": \"test\"},\n            data=None,\n            json_data=None,\n            headers={\"User-Agent\": \"Test\"},\n            cookies={\"session\": \"abc123\"},\n            proxy=None,\n            follow_redirects=True\n        )\n        \n        assert request.method == \"GET\"\n        assert request.url == \"https://example.com\"\n        assert request.params == {\"q\": \"test\"}\n        assert request.headers == {\"User-Agent\": \"Test\"}\n        assert request.follow_redirects is True\n    \n    def test_request_defaults(self):\n        \"\"\"Test Request with default/None values\"\"\"\n        request = Request(\n            method=\"POST\",\n            url=\"https://api.example.com\",\n            params=None,\n            data='{\"key\": \"value\"}',\n            json_data={\"key\": \"value\"},\n            headers={},\n            cookies={},\n            proxy=\"http://proxy:8080\",\n            follow_redirects=False\n        )\n        \n        assert request.method == \"POST\"\n        assert request.data == '{\"key\": \"value\"}'\n        assert request.json_data == {\"key\": \"value\"}\n        assert request.proxy == \"http://proxy:8080\"\n        assert request.follow_redirects is False\n    \n    def test_request_field_access(self):\n        \"\"\"Test 
accessing Request fields\"\"\"\n        request = Request(\n            \"GET\", \"https://example.com\", {}, None, None, {}, {}, None, True\n        )\n        \n        # Test field access by name\n        assert hasattr(request, 'method')\n        assert hasattr(request, 'url') \n        assert hasattr(request, 'params')\n        assert hasattr(request, 'data')\n        assert hasattr(request, 'json_data')\n        assert hasattr(request, 'headers')\n        assert hasattr(request, 'cookies')\n        assert hasattr(request, 'proxy')\n        assert hasattr(request, 'follow_redirects')\n        \n        # Test field access by index\n        assert request[0] == \"GET\"\n        assert request[1] == \"https://example.com\"\n\n\nclass TestLoggingLevels:\n    \"\"\"Test logging level constants\"\"\"\n    \n    def test_known_logging_levels(self):\n        \"\"\"Test that all known logging levels are defined\"\"\"\n        expected_levels = [\"debug\", \"info\", \"warning\", \"error\", \"critical\", \"fatal\"]\n        \n        for level in expected_levels:\n            assert level in _known_logging_levels\n            assert isinstance(_known_logging_levels[level], int)\n    \n    def test_logging_level_values(self):\n        \"\"\"Test logging level values are correct\"\"\"\n        from logging import DEBUG, INFO, WARNING, ERROR, CRITICAL, FATAL\n        \n        assert _known_logging_levels[\"debug\"] == DEBUG\n        assert _known_logging_levels[\"info\"] == INFO\n        assert _known_logging_levels[\"warning\"] == WARNING\n        assert _known_logging_levels[\"error\"] == ERROR\n        assert _known_logging_levels[\"critical\"] == CRITICAL\n        assert _known_logging_levels[\"fatal\"] == FATAL\n    \n    def test_level_hierarchy(self):\n        \"\"\"Test that logging levels have correct hierarchy\"\"\"\n        levels = [\n            _known_logging_levels[\"debug\"],\n            _known_logging_levels[\"info\"],\n            _known_logging_levels[\"warning\"],\n            _known_logging_levels[\"error\"],\n            _known_logging_levels[\"critical\"],\n        ]\n        \n        # Levels should be in ascending order\n        for i in range(len(levels) - 1):\n            assert levels[i] < levels[i + 1]\n"
  },
  {
    "path": "tests/core/test_storage_core.py",
    "content": "import tempfile\nimport os\n\nfrom scrapling.core.storage import SQLiteStorageSystem\n\n\nclass TestSQLiteStorageSystem:\n    \"\"\"Test SQLiteStorageSystem functionality\"\"\"\n    \n    def test_sqlite_storage_creation(self):\n        \"\"\"Test SQLite storage system creation\"\"\"\n        # Use an in-memory database for testing\n        storage = SQLiteStorageSystem(storage_file=\":memory:\")\n        assert storage is not None\n    \n    def test_sqlite_storage_with_file(self):\n        \"\"\"Test SQLite storage with an actual file\"\"\"\n        with tempfile.NamedTemporaryFile(suffix='.db', delete=False) as tmp_file:\n            db_path = tmp_file.name\n\n        storage = None\n        try:\n            storage = SQLiteStorageSystem(storage_file=db_path)\n            assert storage is not None\n            assert os.path.exists(db_path)\n        finally:\n            # Close the database connection before deleting (required on Windows)\n            if storage is not None:\n                storage.close()\n            if os.path.exists(db_path):\n                os.unlink(db_path)\n    \n    def test_sqlite_storage_initialization_args(self):\n        \"\"\"Test SQLite storage with various initialization arguments\"\"\"\n        # Test with URL parameter\n        storage = SQLiteStorageSystem(\n            storage_file=\":memory:\",\n            url=\"https://example.com\"\n        )\n        assert storage is not None\n        assert storage.url == \"https://example.com\"\n"
  },
  {
    "path": "tests/fetchers/__init__.py",
    "content": "# Because I'm too lazy to mock requests :)\n"
  },
  {
    "path": "tests/fetchers/async/__init__.py",
    "content": ""
  },
  {
    "path": "tests/fetchers/async/test_dynamic.py",
    "content": "import pytest\nimport pytest_httpbin\n\nfrom scrapling import DynamicFetcher\n\nDynamicFetcher.adaptive = True\n\n\n@pytest_httpbin.use_class_based_httpbin\nclass TestDynamicFetcherAsync:\n    @pytest.fixture\n    def fetcher(self):\n        return DynamicFetcher\n\n    @pytest.fixture\n    def urls(self, httpbin):\n        return {\n            \"status_200\": f\"{httpbin.url}/status/200\",\n            \"status_404\": f\"{httpbin.url}/status/404\",\n            \"status_501\": f\"{httpbin.url}/status/501\",\n            \"basic_url\": f\"{httpbin.url}/get\",\n            \"html_url\": f\"{httpbin.url}/html\",\n            \"delayed_url\": f\"{httpbin.url}/delay/10\",\n            \"cookies_url\": f\"{httpbin.url}/cookies/set/test/value\",\n        }\n\n    @pytest.mark.asyncio\n    async def test_basic_fetch(self, fetcher, urls):\n        \"\"\"Test doing a basic fetch request with multiple statuses\"\"\"\n        response = await fetcher.async_fetch(urls[\"status_200\"])\n        assert response.status == 200\n\n    @pytest.mark.asyncio\n    async def test_cookies_loading(self, fetcher, urls):\n        \"\"\"Test if cookies are set after the request\"\"\"\n        response = await fetcher.async_fetch(urls[\"cookies_url\"])\n        cookies = {response.cookies[0]['name']: response.cookies[0]['value']}\n        assert cookies == {\"test\": \"value\"}\n\n    @pytest.mark.asyncio\n    async def test_automation(self, fetcher, urls):\n        \"\"\"Test if automation breaks the code or not\"\"\"\n\n        async def scroll_page(page):\n            await page.mouse.wheel(10, 0)\n            await page.mouse.move(100, 400)\n            await page.mouse.up()\n            return page\n\n        response = await fetcher.async_fetch(urls[\"html_url\"], page_action=scroll_page)\n        assert response.status == 200\n\n    @pytest.mark.parametrize(\n        \"kwargs\",\n        [\n            {\"real_chrome\": True, \"disable_resources\": True},\n            {\"wait_selector\": \"h1\", \"wait_selector_state\": \"attached\"},\n            {\"wait_selector\": \"h1\", \"wait_selector_state\": \"visible\"},\n            {\n                \"google_search\": True,\n                \"real_chrome\": True,\n                \"wait\": 10,\n                \"locale\": \"en-US\",\n                \"extra_headers\": {\"ayo\": \"\"},\n                \"useragent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0\",\n                \"cookies\": [{\"name\": \"test\", \"value\": \"123\", \"domain\": \"example.com\", \"path\": \"/\"}],\n                \"network_idle\": True,\n                \"selector_config\": {\"keep_comments\": False, \"keep_cdata\": False},\n            },\n        ],\n    )\n    @pytest.mark.asyncio\n    async def test_properties(self, fetcher, urls, kwargs):\n        \"\"\"Test if different arguments break the code or not\"\"\"\n        response = await fetcher.async_fetch(urls[\"html_url\"], **kwargs)\n        assert response.status == 200\n\n    @pytest.mark.asyncio\n    async def test_cdp_url_invalid(self, fetcher, urls):\n        \"\"\"Test if invalid CDP URLs raise appropriate exceptions\"\"\"\n        with pytest.raises(TypeError):\n            await fetcher.async_fetch(urls[\"html_url\"], cdp_url=\"blahblah\")\n\n        with pytest.raises(TypeError):\n            await fetcher.async_fetch(\n                urls[\"html_url\"], cdp_url=\"blahblah\"\n            )\n\n        with pytest.raises(Exception):\n            await 
fetcher.async_fetch(urls[\"html_url\"], cdp_url=\"ws://blahblah\")\n"
  },
  {
    "path": "tests/fetchers/async/test_dynamic_session.py",
    "content": "import pytest\nimport asyncio\n\nimport pytest_httpbin\n\nfrom scrapling.fetchers import AsyncDynamicSession\n\n\n@pytest_httpbin.use_class_based_httpbin\n@pytest.mark.asyncio\nclass TestAsyncDynamicSession:\n    \"\"\"Test AsyncDynamicSession\"\"\"\n\n    # The `AsyncDynamicSession` is inheriting from `DynamicSession` class so no need to repeat all the tests\n    @pytest.fixture\n    def urls(self, httpbin):\n        return {\n            \"basic\": f\"{httpbin.url}/get\",\n            \"html\": f\"{httpbin.url}/html\",\n        }\n\n    async def test_concurrent_async_requests(self, urls):\n        \"\"\"Test concurrent requests with async session\"\"\"\n        async with AsyncDynamicSession(max_pages=3) as session:\n            # Launch multiple concurrent requests\n            tasks = [\n                session.fetch(urls[\"basic\"]),\n                session.fetch(urls[\"html\"]),\n                session.fetch(urls[\"basic\"])\n            ]\n\n            assert session.max_pages == 3\n            assert session.page_pool.max_pages == 3\n            assert session.context is not None\n\n            responses = await asyncio.gather(*tasks)\n\n            # All should succeed\n            assert all(r.status == 200 for r in responses)\n\n            # Check pool stats\n            stats = session.get_pool_stats()\n            assert stats[\"total_pages\"] <= 3\n\n        # After exit, should be closed\n        assert session._is_alive is False\n\n        # Should raise RuntimeError when used after closing\n        with pytest.raises(RuntimeError):\n            await session.fetch(urls[\"basic\"])\n\n    async def test_page_pool_management(self, urls):\n        \"\"\"Test page pool creation and reuse\"\"\"\n        async with AsyncDynamicSession() as session:\n            # The first request creates a page\n            response = await session.fetch(urls[\"basic\"])\n            assert response.status == 200\n            assert session.page_pool.pages_count == 0\n            \n            # The second request should reuse the page\n            response = await session.fetch(urls[\"html\"])\n            assert response.status == 200\n            assert session.page_pool.pages_count == 0\n\n            # Check pool stats\n            stats = session.get_pool_stats()\n            assert stats[\"total_pages\"] == 0\n            assert stats[\"max_pages\"] == 1\n\n    async def test_dynamic_session_with_options(self, urls):\n        \"\"\"Test AsyncDynamicSession with various options\"\"\"\n        async with AsyncDynamicSession(\n                headless=False,\n                disable_resources=True,\n                extra_headers={\"X-Test\": \"value\"}\n        ) as session:\n            response = await session.fetch(urls[\"html\"])\n            assert response.status == 200\n\n    async def test_error_handling_in_fetch(self, urls):\n        \"\"\"Test error handling during fetch\"\"\"\n        async with AsyncDynamicSession() as session:\n            # Test with invalid URL\n            with pytest.raises(Exception):\n                await session.fetch(\"invalid://url\")\n"
  },
  {
    "path": "tests/fetchers/async/test_requests.py",
    "content": "import pytest\nimport pytest_httpbin\n\nfrom scrapling.fetchers import AsyncFetcher\n\nAsyncFetcher.adaptive = True\n\n\n@pytest_httpbin.use_class_based_httpbin\n@pytest.mark.asyncio\nclass TestAsyncFetcher:\n    @pytest.fixture(scope=\"class\")\n    def fetcher(self):\n        return AsyncFetcher\n\n    @pytest.fixture(scope=\"class\")\n    def urls(self, httpbin):\n        return {\n            \"status_200\": f\"{httpbin.url}/status/200\",\n            \"status_404\": f\"{httpbin.url}/status/404\",\n            \"status_501\": f\"{httpbin.url}/status/501\",\n            \"basic_url\": f\"{httpbin.url}/get\",\n            \"post_url\": f\"{httpbin.url}/post\",\n            \"put_url\": f\"{httpbin.url}/put\",\n            \"delete_url\": f\"{httpbin.url}/delete\",\n            \"html_url\": f\"{httpbin.url}/html\",\n        }\n\n    async def test_basic_get(self, fetcher, urls):\n        \"\"\"Test doing basic get request with multiple statuses\"\"\"\n        assert (await fetcher.get(urls[\"status_200\"])).status == 200\n        assert (await fetcher.get(urls[\"status_404\"])).status == 404\n        assert (await fetcher.get(urls[\"status_501\"])).status == 501\n\n    async def test_get_properties(self, fetcher, urls):\n        \"\"\"Test if different arguments with the GET request break the code or not\"\"\"\n        assert (\n            await fetcher.get(urls[\"status_200\"], stealthy_headers=True)\n        ).status == 200\n        assert (\n            await fetcher.get(urls[\"status_200\"], follow_redirects=True)\n        ).status == 200\n        assert (await fetcher.get(urls[\"status_200\"], timeout=None)).status == 200\n        assert (\n            await fetcher.get(\n                urls[\"status_200\"],\n                stealthy_headers=True,\n                follow_redirects=True,\n                timeout=None,\n            )\n        ).status == 200\n\n    async def test_post_properties(self, fetcher, urls):\n        \"\"\"Test if different arguments with the POST request break the code or not\"\"\"\n        assert (\n            await fetcher.post(urls[\"post_url\"], data={\"key\": \"value\"})\n        ).status == 200\n        assert (\n            await fetcher.post(\n                urls[\"post_url\"], data={\"key\": \"value\"}, stealthy_headers=True\n            )\n        ).status == 200\n        assert (\n            await fetcher.post(\n                urls[\"post_url\"], data={\"key\": \"value\"}, follow_redirects=True\n            )\n        ).status == 200\n        assert (\n            await fetcher.post(urls[\"post_url\"], data={\"key\": \"value\"}, timeout=None)\n        ).status == 200\n        assert (\n            await fetcher.post(\n                urls[\"post_url\"],\n                data={\"key\": \"value\"},\n                stealthy_headers=True,\n                follow_redirects=True,\n                timeout=None,\n            )\n        ).status == 200\n\n    async def test_put_properties(self, fetcher, urls):\n        \"\"\"Test if different arguments with a PUT request break the code or not\"\"\"\n        assert (await fetcher.put(urls[\"put_url\"], data={\"key\": \"value\"})).status in [\n            200,\n            405,\n        ]\n        assert (\n            await fetcher.put(\n                urls[\"put_url\"], data={\"key\": \"value\"}, stealthy_headers=True\n            )\n        ).status in [200, 405]\n        assert (\n            await fetcher.put(\n                urls[\"put_url\"], data={\"key\": \"value\"}, 
follow_redirects=True\n            )\n        ).status in [200, 405]\n        assert (\n            await fetcher.put(urls[\"put_url\"], data={\"key\": \"value\"}, timeout=None)\n        ).status in [200, 405]\n        assert (\n            await fetcher.put(\n                urls[\"put_url\"],\n                data={\"key\": \"value\"},\n                stealthy_headers=True,\n                follow_redirects=True,\n                timeout=None,\n            )\n        ).status in [200, 405]\n\n    async def test_delete_properties(self, fetcher, urls):\n        \"\"\"Test if different arguments with the DELETE request break the code or not\"\"\"\n        assert (\n            await fetcher.delete(urls[\"delete_url\"], stealthy_headers=True)\n        ).status == 200\n        assert (\n            await fetcher.delete(urls[\"delete_url\"], follow_redirects=True)\n        ).status == 200\n        assert (await fetcher.delete(urls[\"delete_url\"], timeout=None)).status == 200\n        assert (\n            await fetcher.delete(\n                urls[\"delete_url\"],\n                stealthy_headers=True,\n                follow_redirects=True,\n                timeout=None,\n            )\n        ).status == 200\n"
  },
  {
    "path": "tests/fetchers/async/test_requests_session.py",
    "content": "\n\nfrom scrapling.engines.static import AsyncFetcherClient\n\n\nclass TestFetcherSession:\n    \"\"\"Test FetcherSession functionality\"\"\"\n\n    def test_async_fetcher_client_creation(self):\n        \"\"\"Test AsyncFetcherClient creation\"\"\"\n        client = AsyncFetcherClient()\n\n        # Should not have context manager methods\n        assert client.__aenter__ is None\n        assert client.__aexit__ is None\n"
  },
  {
    "path": "tests/fetchers/async/test_stealth.py",
    "content": "import pytest\nimport pytest_httpbin\n\nfrom scrapling import StealthyFetcher\n\nStealthyFetcher.adaptive = True\n\n\n@pytest_httpbin.use_class_based_httpbin\n@pytest.mark.asyncio\nclass TestStealthyFetcher:\n    @pytest.fixture(scope=\"class\")\n    def fetcher(self):\n        return StealthyFetcher\n\n    @pytest.fixture(scope=\"class\")\n    def urls(self, httpbin):\n        url = httpbin.url\n        return {\n            \"status_200\": f\"{url}/status/200\",\n            \"status_404\": f\"{url}/status/404\",\n            \"status_501\": f\"{url}/status/501\",\n            \"basic_url\": f\"{url}/get\",\n            \"html_url\": f\"{url}/html\",\n            \"delayed_url\": f\"{url}/delay/10\",  # 10 Seconds delay response\n            \"cookies_url\": f\"{url}/cookies/set/test/value\"\n        }\n\n    async def test_basic_fetch(self, fetcher, urls):\n        \"\"\"Test doing a basic fetch request with multiple statuses\"\"\"\n        assert (await fetcher.async_fetch(urls[\"status_200\"])).status == 200\n        # assert (await fetcher.async_fetch(urls[\"status_404\"])).status == 404\n        # assert (await fetcher.async_fetch(urls[\"status_501\"])).status == 501\n\n    async def test_cookies_loading(self, fetcher, urls):\n        \"\"\"Test if cookies are set after the request\"\"\"\n        response = await fetcher.async_fetch(urls[\"cookies_url\"])\n        cookies = {response.cookies[0]['name']: response.cookies[0]['value']}\n        assert cookies == {\"test\": \"value\"}\n\n    async def test_automation(self, fetcher, urls):\n        \"\"\"Test if automation breaks the code or not\"\"\"\n\n        async def scroll_page(page):\n            await page.mouse.wheel(10, 0)\n            await page.mouse.move(100, 400)\n            await page.mouse.up()\n            return page\n\n        assert (\n            await fetcher.async_fetch(urls[\"html_url\"], page_action=scroll_page, humanize=True)\n        ).status == 200\n\n    @pytest.mark.parametrize(\n        \"kwargs\",\n        [\n            {\"block_webrtc\": True, \"allow_webgl\": True},\n            {\"block_webrtc\": False, \"allow_webgl\": True},\n            {\"block_webrtc\": True, \"allow_webgl\": False, \"disable_resources\": True},\n            {\"wait_selector\": \"h1\", \"wait_selector_state\": \"attached\"},\n            {\"wait_selector\": \"h1\", \"wait_selector_state\": \"visible\"},\n            {\n                \"network_idle\": True,\n                \"wait\": 10,\n                \"cookies\": [{\"name\": \"test\", \"value\": \"123\", \"domain\": \"example.com\", \"path\": \"/\"}],\n                \"google_search\": True,\n                \"extra_headers\": {\"ayo\": \"\"},\n                \"selector_config\": {\"keep_comments\": False, \"keep_cdata\": False},\n                \"additional_args\": {},\n            },\n        ],\n    )\n    async def test_properties(self, fetcher, urls, kwargs):\n        \"\"\"Test if different arguments break the code or not\"\"\"\n        response = await fetcher.async_fetch(\n            urls[\"html_url\"],\n            **kwargs\n        )\n        assert response.status == 200\n"
  },
  {
    "path": "tests/fetchers/async/test_stealth_session.py",
    "content": "\nimport pytest\nimport asyncio\n\nimport pytest_httpbin\n\nfrom scrapling.fetchers import AsyncStealthySession\n\n\n@pytest_httpbin.use_class_based_httpbin\n@pytest.mark.asyncio\nclass TestAsyncStealthySession:\n    \"\"\"Test AsyncStealthySession\"\"\"\n\n    # The `AsyncStealthySession` is inheriting from `StealthySession` class so no need to repeat all the tests\n    @pytest.fixture\n    def urls(self, httpbin):\n        return {\n            \"basic\": f\"{httpbin.url}/get\",\n            \"html\": f\"{httpbin.url}/html\",\n        }\n\n    async def test_concurrent_async_requests(self, urls):\n        \"\"\"Test concurrent requests with async session\"\"\"\n        async with AsyncStealthySession(max_pages=3) as session:\n            # Launch multiple concurrent requests\n            tasks = [\n                session.fetch(urls[\"basic\"]),\n                session.fetch(urls[\"html\"]),\n                session.fetch(urls[\"basic\"])\n            ]\n\n            assert session.max_pages == 3\n            assert session.page_pool.max_pages == 3\n            assert session.context is not None\n\n            responses = await asyncio.gather(*tasks)\n\n            # All should succeed\n            assert all(r.status == 200 for r in responses)\n\n            # Check pool stats\n            stats = session.get_pool_stats()\n            assert stats[\"total_pages\"] <= 3\n\n        # After exit, should be closed\n        assert session._is_alive is False\n\n        # Should raise RuntimeError when used after closing\n        with pytest.raises(RuntimeError):\n            await session.fetch(urls[\"basic\"])\n\n    async def test_page_pool_management(self, urls):\n        \"\"\"Test page pool creation and reuse\"\"\"\n        async with AsyncStealthySession() as session:\n            # The first request creates a page\n            response = await session.fetch(urls[\"basic\"])\n            assert response.status == 200\n            assert session.page_pool.pages_count == 0\n\n            # The second request should reuse the page\n            response = await session.fetch(urls[\"html\"])\n            assert response.status == 200\n            assert session.page_pool.pages_count == 0\n\n            # Check pool stats\n            stats = session.get_pool_stats()\n            assert stats[\"total_pages\"] == 0\n            assert stats[\"max_pages\"] == 1\n\n    async def test_stealthy_session_with_options(self, urls):\n        \"\"\"Test AsyncStealthySession with various options\"\"\"\n        async with AsyncStealthySession(\n                max_pages=1,\n                block_webrtc=True,\n                allow_webgl=True\n        ) as session:\n            response = await session.fetch(urls[\"html\"])\n            assert response.status == 200\n\n    async def test_error_handling_in_fetch(self, urls):\n        \"\"\"Test error handling during fetch\"\"\"\n        async with AsyncStealthySession() as session:\n            # Test with invalid URL\n            with pytest.raises(Exception):\n                await session.fetch(\"invalid://url\")\n"
  },
  {
    "path": "tests/fetchers/sync/__init__.py",
    "content": ""
  },
  {
    "path": "tests/fetchers/sync/test_dynamic.py",
    "content": "import pytest\nimport pytest_httpbin\n\nfrom scrapling import DynamicFetcher\n\nDynamicFetcher.adaptive = True\n\n\n@pytest_httpbin.use_class_based_httpbin\nclass TestDynamicFetcher:\n    @pytest.fixture(scope=\"class\")\n    def fetcher(self):\n        \"\"\"Fixture to create a StealthyFetcher instance for the entire test class\"\"\"\n        return DynamicFetcher\n\n    @pytest.fixture(autouse=True)\n    def setup_urls(self, httpbin):\n        \"\"\"Fixture to set up URLs for testing\"\"\"\n        self.status_200 = f\"{httpbin.url}/status/200\"\n        self.status_404 = f\"{httpbin.url}/status/404\"\n        self.status_501 = f\"{httpbin.url}/status/501\"\n        self.basic_url = f\"{httpbin.url}/get\"\n        self.html_url = f\"{httpbin.url}/html\"\n        self.delayed_url = f\"{httpbin.url}/delay/10\"  # 10 Seconds delay response\n        self.cookies_url = f\"{httpbin.url}/cookies/set/test/value\"\n\n    def test_basic_fetch(self, fetcher):\n        \"\"\"Test doing a basic fetch request with multiple statuses\"\"\"\n        assert fetcher.fetch(self.status_200).status == 200\n        # There's a bug with playwright makes it crashes if a URL returns status code 4xx/5xx without body, let's disable this till they reply to my issue report\n        # assert fetcher.fetch(self.status_404).status == 404\n        # assert fetcher.fetch(self.status_501).status == 501\n\n    def test_cookies_loading(self, fetcher):\n        \"\"\"Test if cookies are set after the request\"\"\"\n        response = fetcher.fetch(self.cookies_url)\n        cookies = {response.cookies[0]['name']: response.cookies[0]['value']}\n        assert cookies == {\"test\": \"value\"}\n\n    def test_automation(self, fetcher):\n        \"\"\"Test if automation breaks the code or not\"\"\"\n\n        def scroll_page(page):\n            page.mouse.wheel(10, 0)\n            page.mouse.move(100, 400)\n            page.mouse.up()\n            return page\n\n        assert fetcher.fetch(self.html_url, page_action=scroll_page).status == 200\n\n    @pytest.mark.parametrize(\n        \"kwargs\",\n        [\n            {\"disable_resources\": True, \"real_chrome\": True},\n            {\"wait_selector\": \"h1\", \"wait_selector_state\": \"attached\"},\n            {\"wait_selector\": \"h1\", \"wait_selector_state\": \"visible\"},\n            {\n                \"google_search\": True,\n                \"real_chrome\": True,\n                \"wait\": 10,\n                \"locale\": \"en-US\",\n                \"timezone_id\": \"America/New_York\",\n                \"extra_headers\": {\"ayo\": \"\"},\n                \"useragent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0\",\n                \"cookies\": [{\"name\": \"test\", \"value\": \"123\", \"domain\": \"example.com\", \"path\": \"/\"}],\n                \"network_idle\": True,\n                \"selector_config\": {\"keep_comments\": False, \"keep_cdata\": False},\n            },\n        ],\n    )\n    def test_properties(self, fetcher, kwargs):\n        \"\"\"Test if different arguments break the code or not\"\"\"\n        response = fetcher.fetch(self.html_url, **kwargs)\n        assert response.status == 200\n\n    def test_cdp_url_invalid(self, fetcher):\n        \"\"\"Test if invalid CDP URLs raise appropriate exceptions\"\"\"\n        with pytest.raises(TypeError):\n            fetcher.fetch(self.html_url, cdp_url=\"blahblah\")\n\n        with pytest.raises(TypeError):\n            
fetcher.fetch(self.html_url, cdp_url=\"blahblah\")\n\n        with pytest.raises(Exception):\n            fetcher.fetch(self.html_url, cdp_url=\"ws://blahblah\")\n"
  },
  {
    "path": "tests/fetchers/sync/test_requests.py",
    "content": "import pytest\nimport pytest_httpbin\n\nfrom scrapling import Fetcher\n\nFetcher.adaptive = True\n\n\n@pytest_httpbin.use_class_based_httpbin\nclass TestFetcher:\n    @pytest.fixture(scope=\"class\")\n    def fetcher(self):\n        \"\"\"Fixture to create a Fetcher instance for the entire test class\"\"\"\n        return Fetcher\n\n    @pytest.fixture(autouse=True)\n    def setup_urls(self, httpbin):\n        \"\"\"Fixture to set up URLs for testing\"\"\"\n        self.status_200 = f\"{httpbin.url}/status/200\"\n        self.status_404 = f\"{httpbin.url}/status/404\"\n        self.status_501 = f\"{httpbin.url}/status/501\"\n        self.basic_url = f\"{httpbin.url}/get\"\n        self.post_url = f\"{httpbin.url}/post\"\n        self.put_url = f\"{httpbin.url}/put\"\n        self.delete_url = f\"{httpbin.url}/delete\"\n        self.html_url = f\"{httpbin.url}/html\"\n\n    def test_basic_get(self, fetcher):\n        \"\"\"Test doing basic get request with multiple statuses\"\"\"\n        assert fetcher.get(self.status_200).status == 200\n        assert fetcher.get(self.status_404).status == 404\n        assert fetcher.get(self.status_501).status == 501\n\n    def test_get_properties(self, fetcher):\n        \"\"\"Test if different arguments with the GET request break the code or not\"\"\"\n        assert fetcher.get(self.status_200, stealthy_headers=True).status == 200\n        assert fetcher.get(self.status_200, follow_redirects=True).status == 200\n        assert fetcher.get(self.status_200, timeout=None).status == 200\n        assert (\n            fetcher.get(\n                self.status_200,\n                stealthy_headers=True,\n                follow_redirects=True,\n                timeout=None,\n            ).status\n            == 200\n        )\n\n    def test_post_properties(self, fetcher):\n        \"\"\"Test if different arguments with the POST request break the code or not\"\"\"\n        assert fetcher.post(self.post_url, data={\"key\": \"value\"}).status == 200\n        assert (\n            fetcher.post(\n                self.post_url, data={\"key\": \"value\"}, stealthy_headers=True\n            ).status\n            == 200\n        )\n        assert (\n            fetcher.post(\n                self.post_url, data={\"key\": \"value\"}, follow_redirects=True\n            ).status\n            == 200\n        )\n        assert (\n            fetcher.post(self.post_url, data={\"key\": \"value\"}, timeout=None).status\n            == 200\n        )\n        assert (\n            fetcher.post(\n                self.post_url,\n                data={\"key\": \"value\"},\n                stealthy_headers=True,\n                follow_redirects=True,\n                timeout=None,\n            ).status\n            == 200\n        )\n\n    def test_put_properties(self, fetcher):\n        \"\"\"Test if different arguments with a PUT request break the code or not\"\"\"\n        assert fetcher.put(self.put_url, data={\"key\": \"value\"}).status == 200\n        assert (\n            fetcher.put(\n                self.put_url, data={\"key\": \"value\"}, stealthy_headers=True\n            ).status\n            == 200\n        )\n        assert (\n            fetcher.put(\n                self.put_url, data={\"key\": \"value\"}, follow_redirects=True\n            ).status\n            == 200\n        )\n        assert (\n            fetcher.put(self.put_url, data={\"key\": \"value\"}, timeout=None).status == 200\n        )\n        assert (\n            fetcher.put(\n 
               self.put_url,\n                data={\"key\": \"value\"},\n                stealthy_headers=True,\n                follow_redirects=True,\n                timeout=None,\n            ).status\n            == 200\n        )\n\n    def test_delete_properties(self, fetcher):\n        \"\"\"Test if different arguments with the DELETE request break the code or not\"\"\"\n        assert fetcher.delete(self.delete_url, stealthy_headers=True).status == 200\n        assert fetcher.delete(self.delete_url, follow_redirects=True).status == 200\n        assert fetcher.delete(self.delete_url, timeout=None).status == 200\n        assert (\n            fetcher.delete(\n                self.delete_url,\n                stealthy_headers=True,\n                follow_redirects=True,\n                timeout=None,\n            ).status\n            == 200\n        )\n"
  },
  {
    "path": "tests/fetchers/sync/test_requests_session.py",
    "content": "import pytest\n\n\nfrom scrapling.engines.static import _SyncSessionLogic as FetcherSession, FetcherClient\n\n\nclass TestFetcherSession:\n    \"\"\"Test FetcherSession functionality\"\"\"\n\n    def test_fetcher_session_creation(self):\n        \"\"\"Test FetcherSession creation\"\"\"\n        session = FetcherSession(\n            timeout=30,\n            retries=3,\n            stealthy_headers=True\n        )\n\n        assert session._default_timeout == 30\n        assert session._default_retries == 3\n\n    def test_fetcher_session_context_manager(self):\n        \"\"\"Test FetcherSession as a context manager\"\"\"\n        session = FetcherSession()\n\n        with session as s:\n            assert s == session\n            assert session._curl_session is not None\n\n        # Session should be cleaned up\n\n    def test_fetcher_session_double_enter(self):\n        \"\"\"Test error on double entering\"\"\"\n        session = FetcherSession()\n\n        with session:\n            with pytest.raises(RuntimeError):\n                session.__enter__()\n\n    def test_fetcher_client_creation(self):\n        \"\"\"Test FetcherClient creation\"\"\"\n        client = FetcherClient()\n\n        # Should not have context manager methods\n        assert client.__enter__ is None\n        assert client.__exit__ is None\n"
  },
  {
    "path": "tests/fetchers/sync/test_stealth_session.py",
    "content": "import re\nimport pytest\nimport pytest_httpbin\n\nfrom scrapling.engines._browsers._stealth import StealthySession, __CF_PATTERN__\n\n\nclass TestStealthConstants:\n    \"\"\"Test Stealth constants and patterns\"\"\"\n\n    def test_cf_pattern_regex(self):\n        \"\"\"Test __CF_PATTERN__ regex compilation\"\"\"\n\n        assert isinstance(__CF_PATTERN__, re.Pattern)\n\n        # Test matching URLs\n        test_urls = [\n            \"https://challenges.cloudflare.com/cdn-cgi/challenge-platform/h/123456\",\n            \"https://challenges.cloudflare.com/cdn-cgi/challenge-platform/orchestrate/jsch/v1\",\n            \"http://challenges.cloudflare.com/cdn-cgi/challenge-platform/scripts/abc\"\n        ]\n\n        for url in test_urls:\n            assert __CF_PATTERN__.search(url) is not None\n\n        # Test non-matching URLs\n        non_matching_urls = [\n            \"https://example.com/challenge\",\n            \"https://cloudflare.com/something\",\n            \"https://challenges.cloudflare.com/other-path\"\n        ]\n\n        for url in non_matching_urls:\n            assert __CF_PATTERN__.search(url) is None\n\n\n@pytest_httpbin.use_class_based_httpbin\nclass TestStealthySession:\n\n    \"\"\"All the code is tested in the async version tests, so no need to repeat it here. The async class inherits from this one.\"\"\"\n    @pytest.fixture(autouse=True)\n    def setup_urls(self, httpbin):\n        \"\"\"Fixture to set up URLs for testing\"\"\"\n        self.status_200 = f\"{httpbin.url}/status/200\"\n        self.status_404 = f\"{httpbin.url}/status/404\"\n        self.status_501 = f\"{httpbin.url}/status/501\"\n        self.basic_url = f\"{httpbin.url}/get\"\n        self.html_url = f\"{httpbin.url}/html\"\n        self.delayed_url = f\"{httpbin.url}/delay/10\"  # 10 Seconds delay response\n        self.cookies_url = f\"{httpbin.url}/cookies/set/test/value\"\n\n    def test_session_creation(self):\n        \"\"\"Test if the session is created correctly\"\"\"\n\n        with StealthySession(\n            headless=True,\n            disable_resources=True,\n            solve_cloudflare=True,\n            wait=1000,\n            timeout=60000,\n            cookies=[{\"name\": \"test\", \"value\": \"123\", \"domain\": \"example.com\", \"path\": \"/\"}],\n        ) as session:\n\n            assert session.max_pages == 1\n            assert session._config.headless is True\n            assert session._config.disable_resources is True\n            assert session._config.solve_cloudflare is True\n            assert session._config.wait == 1000\n            assert session._config.timeout == 60000\n            assert session.context is not None\n\n            # Test Cloudflare detection\n            for cloudflare_type in ('managed', 'interactive', 'non-interactive'):\n                page_content = f\"\"\"\n                <html>\n                    <script>\n                        cType: '{cloudflare_type}'\n                    </script>\n                </html>\n                \"\"\"\n                result = session._detect_cloudflare(page_content)\n                assert result == cloudflare_type\n\n            page_content = \"\"\"\n            <html>\n                <body>\n                    <p>Regular page content</p>\n                </body>\n            </html>\n            \"\"\"\n\n            result = StealthySession._detect_cloudflare(page_content)\n            assert result is None\n            assert session.fetch(self.status_200).status == 
200\n"
  },
  {
    "path": "tests/fetchers/test_base.py",
    "content": "import pytest\n\nfrom scrapling.engines.toolbelt.custom import BaseFetcher\n\n\nclass TestBaseFetcher:\n    \"\"\"Test BaseFetcher configuration functionality\"\"\"\n\n    def test_default_configuration(self):\n        \"\"\"Test default configuration values\"\"\"\n        config = BaseFetcher.display_config()\n\n        assert config['huge_tree'] is True\n        assert config['adaptive'] is False\n        assert config['keep_comments'] is False\n        assert config['keep_cdata'] is False\n\n    def test_configure_single_parameter(self):\n        \"\"\"Test configuring single parameter\"\"\"\n        BaseFetcher.configure(adaptive=True)\n\n        config = BaseFetcher.display_config()\n        assert config['adaptive'] is True\n\n        # Reset\n        BaseFetcher.configure(adaptive=False)\n\n    def test_configure_multiple_parameters(self):\n        \"\"\"Test configuring multiple parameters\"\"\"\n        BaseFetcher.configure(\n            huge_tree=False,\n            keep_comments=True,\n            adaptive=True\n        )\n\n        config = BaseFetcher.display_config()\n        assert config['huge_tree'] is False\n        assert config['keep_comments'] is True\n        assert config['adaptive'] is True\n\n        # Reset\n        BaseFetcher.configure(\n            huge_tree=True,\n            keep_comments=False,\n            adaptive=False\n        )\n\n    def test_configure_invalid_parameter(self):\n        \"\"\"Test configuring invalid parameter\"\"\"\n        with pytest.raises(ValueError):\n            BaseFetcher.configure(invalid_param=True)\n\n    def test_configure_no_parameters(self):\n        \"\"\"Test configure with no parameters\"\"\"\n        with pytest.raises(AttributeError):\n            BaseFetcher.configure()\n\n    def test_configure_non_parser_keyword(self):\n        \"\"\"Test configuring non-parser keyword\"\"\"\n        with pytest.raises(AttributeError):\n            # Assuming there's some attribute that's not in parser_keywords\n            BaseFetcher.some_other_attr = \"test\"\n            BaseFetcher.configure(some_other_attr=\"new_value\")\n\n    def test_generate_parser_arguments(self):\n        \"\"\"Test parser arguments generation\"\"\"\n        BaseFetcher.configure(\n            huge_tree=False,\n            adaptive=True,\n            adaptive_domain=\"example.com\"\n        )\n\n        args = BaseFetcher._generate_parser_arguments()\n\n        assert args['huge_tree'] is False\n        assert args['adaptive'] is True\n        assert args['adaptive_domain'] == \"example.com\"\n\n        # Reset\n        BaseFetcher.configure(\n            huge_tree=True,\n            adaptive=False\n        )\n        BaseFetcher.adaptive_domain = None\n"
  },
  {
    "path": "tests/fetchers/test_constants.py",
    "content": "from scrapling.engines.constants import EXTRA_RESOURCES, STEALTH_ARGS, HARMFUL_ARGS, DEFAULT_ARGS\n\n\nclass TestConstants:\n    \"\"\"Test constant values\"\"\"\n\n    def test_default_disabled_resources(self):\n        \"\"\"Test default disabled resources\"\"\"\n        assert \"image\" in EXTRA_RESOURCES\n        assert \"font\" in EXTRA_RESOURCES\n        assert \"stylesheet\" in EXTRA_RESOURCES\n        assert \"media\" in EXTRA_RESOURCES\n\n    def test_harmful_default_args(self):\n        \"\"\"Test harmful default arguments\"\"\"\n        assert \"--enable-automation\" in HARMFUL_ARGS\n        assert \"--disable-popup-blocking\" in HARMFUL_ARGS\n\n    def test_flags(self):\n        \"\"\"Test default stealth flags\"\"\"\n        assert \"--no-pings\" in DEFAULT_ARGS\n        # assert \"--incognito\" in STEALTH_ARGS\n        assert \"--disable-blink-features=AutomationControlled\" in STEALTH_ARGS\n"
  },
  {
    "path": "tests/fetchers/test_impersonate_list.py",
    "content": "\"\"\"Test suite for list-based impersonate parameter functionality.\"\"\"\nimport pytest\nimport pytest_httpbin\nfrom unittest.mock import patch, MagicMock\n\nfrom scrapling import Fetcher\nfrom scrapling.fetchers import FetcherSession\nfrom scrapling.engines.static import _select_random_browser\n\n\nclass TestRandomBrowserSelection:\n    \"\"\"Test the random browser selection helper function.\"\"\"\n\n    def test_select_random_browser_with_single_string(self):\n        \"\"\"Test that single browser string is returned as-is.\"\"\"\n        result = _select_random_browser(\"chrome\")\n        assert result == \"chrome\"\n\n    def test_select_random_browser_with_none(self):\n        \"\"\"Test that None is returned as-is.\"\"\"\n        result = _select_random_browser(None)\n        assert result is None\n\n    def test_select_random_browser_with_list(self):\n        \"\"\"Test that a browser is randomly selected from a list.\"\"\"\n        browsers = [\"chrome\", \"firefox\", \"safari\"]\n        result = _select_random_browser(browsers)\n        assert result in browsers\n\n    def test_select_random_browser_with_empty_list(self):\n        \"\"\"Test that empty list returns None.\"\"\"\n        result = _select_random_browser([])\n        assert result is None\n\n    def test_select_random_browser_with_single_item_list(self):\n        \"\"\"Test that single-item list returns that item.\"\"\"\n        result = _select_random_browser([\"chrome\"])\n        assert result == \"chrome\"\n\n\n@pytest_httpbin.use_class_based_httpbin\nclass TestFetcherWithImpersonateList:\n    \"\"\"Test Fetcher with list-based impersonate parameter.\"\"\"\n\n    @pytest.fixture(autouse=True)\n    def setup_urls(self, httpbin):\n        \"\"\"Fixture to set up URLs for testing.\"\"\"\n        self.basic_url = f\"{httpbin.url}/get\"\n\n    def test_get_with_impersonate_list(self):\n        \"\"\"Test that GET request works with impersonate as a list.\"\"\"\n        browsers = [\"chrome\", \"firefox\"]\n        response = Fetcher.get(self.basic_url, impersonate=browsers)\n        assert response.status == 200\n\n    def test_get_with_single_impersonate(self):\n        \"\"\"Test that GET request still works with single browser string.\"\"\"\n        response = Fetcher.get(self.basic_url, impersonate=\"chrome\")\n        assert response.status == 200\n\n    def test_post_with_impersonate_list(self):\n        \"\"\"Test that POST request works with impersonate as a list.\"\"\"\n        browsers = [\"chrome\", \"firefox\"]\n        post_url = self.basic_url.replace(\"/get\", \"/post\")\n        response = Fetcher.post(post_url, data={\"key\": \"value\"}, impersonate=browsers)\n        assert response.status == 200\n\n    def test_put_with_impersonate_list(self):\n        \"\"\"Test that PUT request works with impersonate as a list.\"\"\"\n        browsers = [\"chrome\", \"safari\"]\n        put_url = self.basic_url.replace(\"/get\", \"/put\")\n        response = Fetcher.put(put_url, data={\"key\": \"value\"}, impersonate=browsers)\n        assert response.status == 200\n\n    def test_delete_with_impersonate_list(self):\n        \"\"\"Test that DELETE request works with impersonate as a list.\"\"\"\n        browsers = [\"chrome\", \"edge\"]\n        delete_url = self.basic_url.replace(\"/get\", \"/delete\")\n        response = Fetcher.delete(delete_url, impersonate=browsers)\n        assert response.status == 200\n\n\n@pytest_httpbin.use_class_based_httpbin\nclass 
TestFetcherSessionWithImpersonateList:\n    \"\"\"Test FetcherSession with list-based impersonate parameter.\"\"\"\n\n    @pytest.fixture(autouse=True)\n    def setup_urls(self, httpbin):\n        \"\"\"Fixture to set up URLs for testing.\"\"\"\n        self.basic_url = f\"{httpbin.url}/get\"\n\n    def test_session_init_with_impersonate_list(self):\n        \"\"\"Test that FetcherSession can be initialized with impersonate as a list.\"\"\"\n        browsers = [\"chrome\", \"firefox\", \"safari\"]\n        session = FetcherSession(impersonate=browsers)\n        assert session._default_impersonate == browsers\n\n    def test_session_request_with_impersonate_list(self):\n        \"\"\"Test that session request works with impersonate as a list.\"\"\"\n        browsers = [\"chrome\", \"firefox\"]\n        with FetcherSession(impersonate=browsers) as session:\n            response = session.get(self.basic_url)\n            assert response.status == 200\n\n    def test_session_multiple_requests_with_impersonate_list(self):\n        \"\"\"Test that multiple requests in a session work with impersonate list.\"\"\"\n        browsers = [\"chrome110\", \"chrome120\", \"chrome131\"]\n        with FetcherSession(impersonate=browsers) as session:\n            response1 = session.get(self.basic_url)\n            response2 = session.get(self.basic_url)\n            assert response1.status == 200\n            assert response2.status == 200\n\n    def test_session_request_level_impersonate_override(self):\n        \"\"\"Test that request-level impersonate overrides session-level.\"\"\"\n        session_browsers = [\"chrome\", \"firefox\"]\n        request_browser = \"safari\"\n\n        with FetcherSession(impersonate=session_browsers) as session:\n            response = session.get(self.basic_url, impersonate=request_browser)\n            assert response.status == 200\n\n    def test_session_request_level_impersonate_list_override(self):\n        \"\"\"Test that request-level impersonate list overrides session-level.\"\"\"\n        session_browsers = [\"chrome\", \"firefox\"]\n        request_browsers = [\"safari\", \"edge\"]\n\n        with FetcherSession(impersonate=session_browsers) as session:\n            response = session.get(self.basic_url, impersonate=request_browsers)\n            assert response.status == 200\n\n\nclass TestImpersonateTypeValidation:\n    \"\"\"Test type validation for impersonate parameter.\"\"\"\n\n    def test_impersonate_accepts_string(self):\n        \"\"\"Test that impersonate accepts string type.\"\"\"\n        # This should not raise any type errors\n        session = FetcherSession(impersonate=\"chrome\")\n        assert session._default_impersonate == \"chrome\"\n\n    def test_impersonate_accepts_list(self):\n        \"\"\"Test that impersonate accepts list type.\"\"\"\n        # This should not raise any type errors\n        browsers = [\"chrome\", \"firefox\"]\n        session = FetcherSession(impersonate=browsers)\n        assert session._default_impersonate == browsers\n\n    def test_impersonate_accepts_none(self):\n        \"\"\"Test that impersonate accepts None.\"\"\"\n        # This should not raise any type errors\n        session = FetcherSession(impersonate=None)\n        assert session._default_impersonate is None\n"
  },
  {
    "path": "tests/fetchers/test_pages.py",
    "content": "import pytest\nfrom unittest.mock import Mock\nfrom scrapling.engines._browsers._page import PageInfo, PagePool\n\n\nclass TestPageInfo:\n    \"\"\"Test PageInfo functionality\"\"\"\n\n    def test_page_info_creation(self):\n        \"\"\"Test PageInfo creation\"\"\"\n        mock_page = Mock()\n        page_info = PageInfo(mock_page, \"ready\", \"https://example.com\")\n\n        assert page_info.page == mock_page\n        assert page_info.state == \"ready\"\n        assert page_info.url == \"https://example.com\"\n\n    def test_page_info_marking(self):\n        \"\"\"Test marking page\"\"\"\n        mock_page = Mock()\n        page_info = PageInfo(mock_page, \"ready\", None)\n\n        page_info.mark_busy(\"https://example.com\")\n        assert page_info.state == \"busy\"\n        assert page_info.url == \"https://example.com\"\n\n        page_info.mark_error()\n        assert page_info.state == \"error\"\n\n    def test_page_info_equality(self):\n        \"\"\"Test PageInfo equality comparison\"\"\"\n        mock_page1 = Mock()\n        mock_page2 = Mock()\n\n        page_info1 = PageInfo(mock_page1, \"ready\", None)\n        page_info2 = PageInfo(mock_page1, \"busy\", None)  # Same page, different state\n        page_info3 = PageInfo(mock_page2, \"ready\", None)  # Different page\n\n        assert page_info1 == page_info2  # Same page\n        assert page_info1 != page_info3  # Different page\n        assert page_info1 != \"not a page info\"  # Different type\n\n    def test_page_info_repr(self):\n        \"\"\"Test PageInfo string representation\"\"\"\n        mock_page = Mock()\n        page_info = PageInfo(mock_page, \"ready\", \"https://example.com\")\n\n        repr_str = repr(page_info)\n        assert \"ready\" in repr_str\n        assert \"https://example.com\" in repr_str\n\n\nclass TestPagePool:\n    \"\"\"Test PagePool functionality\"\"\"\n\n    def test_page_pool_creation(self):\n        \"\"\"Test PagePool creation\"\"\"\n        pool = PagePool(max_pages=5)\n\n        assert pool.max_pages == 5\n        assert pool.pages_count == 0\n        assert pool.busy_count == 0\n\n    def test_add_page(self):\n        \"\"\"Test adding page to pool\"\"\"\n        pool = PagePool(max_pages=2)\n        mock_page = Mock()\n\n        page_info = pool.add_page(mock_page)\n\n        assert isinstance(page_info, PageInfo)\n        assert page_info.page == mock_page\n        assert page_info.state == \"ready\"\n        assert pool.pages_count == 1\n\n    def test_add_page_limit_exceeded(self):\n        \"\"\"Test adding page when limit exceeded\"\"\"\n        pool = PagePool(max_pages=1)\n\n        # Add first page\n        pool.add_page(Mock())\n\n        # Try to add a second page\n        with pytest.raises(RuntimeError):\n            pool.add_page(Mock())\n\n\n\n    def test_cleanup_error_pages(self):\n        \"\"\"Test cleaning up error pages\"\"\"\n        pool = PagePool(max_pages=3)\n\n        # Add pages\n        page1 = pool.add_page(Mock())\n        _ = pool.add_page(Mock())\n        page3 = pool.add_page(Mock())\n\n        # Mark some as error\n        page1.mark_error()\n        page3.mark_error()\n\n        assert pool.pages_count == 3\n\n        pool.cleanup_error_pages()\n\n        assert pool.pages_count == 1  # Only 2 should remain\n"
  },
  {
    "path": "tests/fetchers/test_proxy_rotation.py",
    "content": "import pytest\nimport random\nfrom threading import Thread\nfrom concurrent.futures import ThreadPoolExecutor\n\nfrom scrapling.engines.toolbelt import ProxyRotator, is_proxy_error, cyclic_rotation\n\n\nclass TestCyclicRotationStrategy:\n    \"\"\"Test the default cyclic_rotation strategy function\"\"\"\n\n    def test_cyclic_rotation_cycles_through_proxies(self):\n        \"\"\"Test that cyclic_rotation returns proxies in order\"\"\"\n        proxies = [\"http://p1:8080\", \"http://p2:8080\", \"http://p3:8080\"]\n\n        proxy, next_idx = cyclic_rotation(proxies, 0)\n        assert proxy == \"http://p1:8080\"\n        assert next_idx == 1\n\n        proxy, next_idx = cyclic_rotation(proxies, 1)\n        assert proxy == \"http://p2:8080\"\n        assert next_idx == 2\n\n        proxy, next_idx = cyclic_rotation(proxies, 2)\n        assert proxy == \"http://p3:8080\"\n        assert next_idx == 0  # Wraps around\n\n    def test_cyclic_rotation_wraps_index(self):\n        \"\"\"Test that cyclic_rotation handles index overflow\"\"\"\n        proxies = [\"http://p1:8080\", \"http://p2:8080\"]\n\n        # Index larger than list length should wrap\n        proxy, next_idx = cyclic_rotation(proxies, 5)\n        assert proxy == \"http://p2:8080\"  # 5 % 2 = 1\n        assert next_idx == 0\n\n\nclass TestProxyRotatorCreation:\n    \"\"\"Test ProxyRotator initialization and validation\"\"\"\n\n    def test_create_with_string_proxies(self):\n        \"\"\"Test creating rotator with string proxy URLs\"\"\"\n        proxies = [\"http://p1:8080\", \"http://p2:8080\"]\n        rotator = ProxyRotator(proxies)\n\n        assert len(rotator) == 2\n        assert rotator.proxies == proxies\n\n    def test_create_with_dict_proxies(self):\n        \"\"\"Test creating rotator with dict proxies\"\"\"\n        proxies = [\n            {\"server\": \"http://p1:8080\", \"username\": \"user1\", \"password\": \"pass1\"},\n            {\"server\": \"http://p2:8080\"},\n        ]\n        rotator = ProxyRotator(proxies)\n\n        assert len(rotator) == 2\n        assert rotator.proxies == proxies\n\n    def test_create_with_mixed_proxies(self):\n        \"\"\"Test creating rotator with mixed string and dict proxies\"\"\"\n        proxies = [\n            \"http://p1:8080\",\n            {\"server\": \"http://p2:8080\", \"username\": \"user\"},\n        ]\n        rotator = ProxyRotator(proxies)\n\n        assert len(rotator) == 2\n\n    def test_empty_proxies_raises_error(self):\n        \"\"\"Test that empty proxy list raises ValueError\"\"\"\n        with pytest.raises(ValueError, match=\"At least one proxy must be provided\"):\n            ProxyRotator([])\n\n    def test_dict_without_server_raises_error(self):\n        \"\"\"Test that dict proxy without 'server' key raises ValueError\"\"\"\n        with pytest.raises(ValueError, match=\"Proxy dict must have a 'server' key\"):\n            ProxyRotator([{\"username\": \"user\", \"password\": \"pass\"}])\n\n    def test_invalid_proxy_type_raises_error(self):\n        \"\"\"Test that invalid proxy type raises TypeError\"\"\"\n        with pytest.raises(TypeError, match=\"Invalid proxy type\"):\n            ProxyRotator([123])\n\n        with pytest.raises(TypeError, match=\"Invalid proxy type\"):\n            ProxyRotator([None])\n\n    def test_non_callable_strategy_raises_error(self):\n        \"\"\"Test that non-callable strategy raises TypeError\"\"\"\n        with pytest.raises(TypeError, match=\"strategy must be callable\"):\n            
ProxyRotator([\"http://p1:8080\"], strategy=\"cyclic_rotation\")\n\n        with pytest.raises(TypeError, match=\"strategy must be callable\"):\n            ProxyRotator([\"http://p1:8080\"], strategy=123)\n\n\nclass TestProxyRotatorRotation:\n    \"\"\"Test ProxyRotator rotation behavior\"\"\"\n\n    def test_get_proxy_cyclic_rotation(self):\n        \"\"\"Test that get_proxy cycles through proxies in order\"\"\"\n        proxies = [\"http://p1:8080\", \"http://p2:8080\", \"http://p3:8080\"]\n        rotator = ProxyRotator(proxies)\n\n        # First cycle\n        assert rotator.get_proxy() == \"http://p1:8080\"\n        assert rotator.get_proxy() == \"http://p2:8080\"\n        assert rotator.get_proxy() == \"http://p3:8080\"\n\n        # Second cycle - wraps around\n        assert rotator.get_proxy() == \"http://p1:8080\"\n        assert rotator.get_proxy() == \"http://p2:8080\"\n        assert rotator.get_proxy() == \"http://p3:8080\"\n\n    def test_get_proxy_single_proxy(self):\n        \"\"\"Test rotation with single proxy always returns the same proxy\"\"\"\n        rotator = ProxyRotator([\"http://only:8080\"])\n\n        for _ in range(5):\n            assert rotator.get_proxy() == \"http://only:8080\"\n\n    def test_get_proxy_with_dict_proxies(self):\n        \"\"\"Test rotation with dict proxies\"\"\"\n        proxies = [\n            {\"server\": \"http://p1:8080\"},\n            {\"server\": \"http://p2:8080\"},\n        ]\n        rotator = ProxyRotator(proxies)\n\n        assert rotator.get_proxy() == {\"server\": \"http://p1:8080\"}\n        assert rotator.get_proxy() == {\"server\": \"http://p2:8080\"}\n        assert rotator.get_proxy() == {\"server\": \"http://p1:8080\"}\n\n\nclass TestCustomStrategies:\n    \"\"\"Test ProxyRotator with custom rotation strategies\"\"\"\n\n    def test_random_strategy(self):\n        \"\"\"Test custom random selection strategy\"\"\"\n        def random_strategy(proxies, idx):\n            return random.choice(proxies), idx\n\n        proxies = [\"http://p1:8080\", \"http://p2:8080\", \"http://p3:8080\"]\n        rotator = ProxyRotator(proxies, strategy=random_strategy)\n\n        # Get multiple proxies - they should all be valid\n        results = [rotator.get_proxy() for _ in range(10)]\n        assert all(p in proxies for p in results)\n\n    def test_sticky_strategy(self):\n        \"\"\"Test custom sticky strategy that always returns first proxy\"\"\"\n        def sticky_strategy(proxies, idx):\n            return proxies[0], idx\n\n        rotator = ProxyRotator(\n            [\"http://p1:8080\", \"http://p2:8080\"],\n            strategy=sticky_strategy\n        )\n\n        for _ in range(5):\n            assert rotator.get_proxy() == \"http://p1:8080\"\n\n    def test_weighted_strategy(self):\n        \"\"\"Test custom weighted strategy\"\"\"\n        call_count = {\"count\": 0}\n\n        def alternating_strategy(proxies, idx):\n            # Returns first proxy twice, then second proxy once\n            call_count[\"count\"] += 1\n            if call_count[\"count\"] % 3 == 0:\n                return proxies[1], idx\n            return proxies[0], idx\n\n        rotator = ProxyRotator(\n            [\"http://primary:8080\", \"http://backup:8080\"],\n            strategy=alternating_strategy\n        )\n\n        assert rotator.get_proxy() == \"http://primary:8080\"\n        assert rotator.get_proxy() == \"http://primary:8080\"\n        assert rotator.get_proxy() == \"http://backup:8080\"\n\n    def 
test_lambda_strategy(self):\n        \"\"\"Test using lambda as strategy\"\"\"\n        rotator = ProxyRotator(\n            [\"http://p1:8080\", \"http://p2:8080\"],\n            strategy=lambda proxies, idx: (proxies[-1], idx)  # Always last\n        )\n\n        assert rotator.get_proxy() == \"http://p2:8080\"\n        assert rotator.get_proxy() == \"http://p2:8080\"\n\n\nclass TestProxyRotatorProperties:\n    \"\"\"Test ProxyRotator properties and methods\"\"\"\n\n    def test_proxies_property_returns_copy(self):\n        \"\"\"Test that proxies property returns a copy, not the original list\"\"\"\n        original = [\"http://p1:8080\", \"http://p2:8080\"]\n        rotator = ProxyRotator(original)\n\n        proxies_copy = rotator.proxies\n        proxies_copy.append(\"http://p3:8080\")\n\n        # Original should be unchanged\n        assert len(rotator) == 2\n        assert len(rotator.proxies) == 2\n\n    def test_len_returns_proxy_count(self):\n        \"\"\"Test __len__ returns correct count\"\"\"\n        assert len(ProxyRotator([\"http://p1:8080\"])) == 1\n        assert len(ProxyRotator([\"http://p1:8080\", \"http://p2:8080\"])) == 2\n        assert len(ProxyRotator([\"a\", \"b\", \"c\", \"d\", \"e\"])) == 5\n\n    def test_repr(self):\n        \"\"\"Test __repr__ format\"\"\"\n        rotator = ProxyRotator([\"http://p1:8080\", \"http://p2:8080\", \"http://p3:8080\"])\n        assert repr(rotator) == \"ProxyRotator(proxies=3)\"\n\n\nclass TestProxyRotatorThreadSafety:\n    \"\"\"Test ProxyRotator thread safety\"\"\"\n\n    def test_concurrent_get_proxy(self):\n        \"\"\"Test that concurrent get_proxy calls don't cause errors\"\"\"\n        proxies = [f\"http://p{i}:8080\" for i in range(10)]\n        rotator = ProxyRotator(proxies)\n        results = []\n\n        def get_proxies(n):\n            for _ in range(n):\n                results.append(rotator.get_proxy())\n\n        threads = [Thread(target=get_proxies, args=(100,)) for _ in range(10)]\n        for t in threads:\n            t.start()\n        for t in threads:\n            t.join()\n\n        # All results should be valid proxies\n        assert len(results) == 1000\n        assert all(p in proxies for p in results)\n\n    def test_thread_pool_concurrent_access(self):\n        \"\"\"Test concurrent access using ThreadPoolExecutor\"\"\"\n        proxies = [\"http://p1:8080\", \"http://p2:8080\", \"http://p3:8080\"]\n        rotator = ProxyRotator(proxies)\n\n        with ThreadPoolExecutor(max_workers=5) as executor:\n            futures = [executor.submit(rotator.get_proxy) for _ in range(100)]\n            results = [f.result() for f in futures]\n\n        assert len(results) == 100\n        assert all(p in proxies for p in results)\n\n\nclass TestIsProxyError:\n    \"\"\"Test is_proxy_error utility function\"\"\"\n\n    @pytest.mark.parametrize(\"error_msg\", [\n        \"net::err_proxy_connection_failed\",\n        \"NET::ERR_PROXY_AUTH_FAILED\",\n        \"net::err_tunnel_connection_failed\",\n        \"Connection refused by proxy\",\n        \"Connection reset by peer\",\n        \"Connection timed out while connecting to proxy\",\n        \"Failed to connect to proxy server\",\n        \"Could not resolve proxy host\",\n    ])\n    def test_proxy_errors_detected(self, error_msg):\n        \"\"\"Test that proxy-related errors are detected\"\"\"\n        assert is_proxy_error(Exception(error_msg)) is True\n\n    @pytest.mark.parametrize(\"error_msg\", [\n        \"Page not found\",\n        \"404 Not 
Found\",\n        \"Internal server error\",\n        \"DNS resolution failed\",\n        \"SSL certificate error\",\n        \"Timeout waiting for response\",\n        \"Invalid JSON response\",\n    ])\n    def test_non_proxy_errors_not_detected(self, error_msg):\n        \"\"\"Test that non-proxy errors are not detected as proxy errors\"\"\"\n        assert is_proxy_error(Exception(error_msg)) is False\n\n    def test_case_insensitive_detection(self):\n        \"\"\"Test that error detection is case-insensitive\"\"\"\n        assert is_proxy_error(Exception(\"NET::ERR_PROXY\")) is True\n        assert is_proxy_error(Exception(\"Net::Err_Proxy\")) is True\n        assert is_proxy_error(Exception(\"CONNECTION REFUSED\")) is True\n\n    def test_empty_error_message(self):\n        \"\"\"Test handling of empty error message\"\"\"\n        assert is_proxy_error(Exception(\"\")) is False\n\n    def test_custom_exception_types(self):\n        \"\"\"Test with custom exception types\"\"\"\n        class CustomError(Exception):\n            pass\n\n        assert is_proxy_error(CustomError(\"net::err_proxy_failed\")) is True\n        assert is_proxy_error(CustomError(\"normal error\")) is False\n"
  },
  {
    "path": "tests/fetchers/test_response_handling.py",
    "content": "from unittest.mock import Mock\n\nfrom scrapling.parser import Selector\nfrom scrapling.engines.toolbelt.convertor import ResponseFactory, Response\n\n\nclass TestResponseFactory:\n    \"\"\"Test ResponseFactory functionality\"\"\"\n\n    def test_response_from_curl(self):\n        \"\"\"Test creating response from curl_cffi response\"\"\"\n        # Mock curl response\n        mock_curl_response = Mock()\n        mock_curl_response.url = \"https://example.com\"\n        mock_curl_response.content = b\"<html><body>Test</body></html>\"\n        mock_curl_response.status_code = 200\n        mock_curl_response.reason = \"OK\"\n        mock_curl_response.encoding = \"utf-8\"\n        mock_curl_response.cookies = {\"session\": \"abc\"}\n        mock_curl_response.headers = {\"Content-Type\": \"text/html\"}\n        mock_curl_response.request.headers = {\"User-Agent\": \"Test\"}\n        mock_curl_response.request.method = \"GET\"\n        mock_curl_response.history = []\n\n        response = ResponseFactory.from_http_request(\n            mock_curl_response,\n            {\"adaptive\": False}\n        )\n\n        assert response.status == 200\n        assert response.url == \"https://example.com\"\n        assert isinstance(response, Response)\n\n    def test_response_history_processing(self):\n        \"\"\"Test processing response history\"\"\"\n        # Mock responses with redirects\n        mock_final = Mock()\n        mock_final.status = 200\n        mock_final.status_text = \"OK\"\n        mock_final.all_headers = Mock(return_value={})\n\n        mock_redirect = Mock()\n        mock_redirect.url = \"https://example.com/redirect\"\n        mock_redirect.response = Mock(return_value=mock_final)\n        mock_redirect.all_headers = Mock(return_value={})\n        mock_redirect.redirected_from = None\n\n        mock_first = Mock()\n        mock_first.request.redirected_from = mock_redirect\n\n        # Process history\n        history = ResponseFactory._process_response_history(\n            mock_first,\n            {}\n        )\n\n        assert len(history) >= 0  # Should process redirects\n\n\nclass TestErrorScenarios:\n    \"\"\"Test various error scenarios\"\"\"\n\n    def test_invalid_html_handling(self):\n        \"\"\"Test handling of malformed HTML\"\"\"\n        malformed_html = \"\"\"\n        <html>\n            <body>\n                <div>Unclosed div\n                <p>Paragraph without closing tag\n                <span>Nested unclosed\n            </body>\n        \"\"\"\n\n        # Should handle gracefully\n        page = Selector(malformed_html)\n        assert page is not None\n\n        # Should still be able to select elements\n        divs = page.css(\"div\")\n        assert len(divs) > 0\n\n    def test_empty_responses(self):\n        \"\"\"Test handling of empty responses\"\"\"\n        # Empty HTML\n        page = Selector(\"\")\n        assert page is not None\n\n        # Whitespace only\n        page = Selector(\"   \\n\\t   \")\n        assert page is not None\n\n        # Null bytes\n        page = Selector(\"Hello\\x00World\")\n        assert \"Hello\" in page.get_all_text()\n"
  },
  {
    "path": "tests/fetchers/test_utils.py",
    "content": "import pytest\n\nfrom scrapling.engines.toolbelt.custom import StatusText, Response\nfrom scrapling.engines.toolbelt.navigation import (\n    construct_proxy_dict,\n    create_intercept_handler,\n    create_async_intercept_handler,\n)\nfrom scrapling.engines.toolbelt.fingerprints import (\n    get_os_name,\n    generate_headers\n)\n\n\n@pytest.fixture\ndef content_type_map():\n    return {\n        # A map generated by ChatGPT for most possible `content_type` values and the expected outcome\n        \"text/html; charset=UTF-8\": \"UTF-8\",\n        \"text/html; charset=ISO-8859-1\": \"ISO-8859-1\",\n        \"text/html\": \"ISO-8859-1\",\n        \"application/json; charset=UTF-8\": \"UTF-8\",\n        \"application/json\": \"utf-8\",\n        \"text/json\": \"utf-8\",\n        \"application/javascript; charset=UTF-8\": \"UTF-8\",\n        \"application/javascript\": \"utf-8\",\n        \"text/plain; charset=UTF-8\": \"UTF-8\",\n        \"text/plain; charset=ISO-8859-1\": \"ISO-8859-1\",\n        \"text/plain\": \"ISO-8859-1\",\n        \"application/xhtml+xml; charset=UTF-8\": \"UTF-8\",\n        \"application/xhtml+xml\": \"utf-8\",\n        \"text/html; charset=windows-1252\": \"windows-1252\",\n        \"application/json; charset=windows-1252\": \"windows-1252\",\n        \"text/plain; charset=windows-1252\": \"windows-1252\",\n        'text/html; charset=\"UTF-8\"': \"UTF-8\",\n        'text/html; charset=\"ISO-8859-1\"': \"ISO-8859-1\",\n        'text/html; charset=\"windows-1252\"': \"windows-1252\",\n        'application/json; charset=\"UTF-8\"': \"UTF-8\",\n        'application/json; charset=\"ISO-8859-1\"': \"ISO-8859-1\",\n        'application/json; charset=\"windows-1252\"': \"windows-1252\",\n        'text/json; charset=\"UTF-8\"': \"UTF-8\",\n        'application/javascript; charset=\"UTF-8\"': \"UTF-8\",\n        'application/javascript; charset=\"ISO-8859-1\"': \"ISO-8859-1\",\n        'text/plain; charset=\"UTF-8\"': \"UTF-8\",\n        'text/plain; charset=\"ISO-8859-1\"': \"ISO-8859-1\",\n        'text/plain; charset=\"windows-1252\"': \"windows-1252\",\n        'application/xhtml+xml; charset=\"UTF-8\"': \"UTF-8\",\n        'application/xhtml+xml; charset=\"ISO-8859-1\"': \"ISO-8859-1\",\n        'application/xhtml+xml; charset=\"windows-1252\"': \"windows-1252\",\n        'text/html; charset=\"US-ASCII\"': \"US-ASCII\",\n        'application/json; charset=\"US-ASCII\"': \"US-ASCII\",\n        'text/plain; charset=\"US-ASCII\"': \"US-ASCII\",\n        'text/html; charset=\"Shift_JIS\"': \"Shift_JIS\",\n        'application/json; charset=\"Shift_JIS\"': \"Shift_JIS\",\n        'text/plain; charset=\"Shift_JIS\"': \"Shift_JIS\",\n        'application/xml; charset=\"UTF-8\"': \"UTF-8\",\n        'application/xml; charset=\"ISO-8859-1\"': \"ISO-8859-1\",\n        \"application/xml\": \"utf-8\",\n        'text/xml; charset=\"UTF-8\"': \"UTF-8\",\n        'text/xml; charset=\"ISO-8859-1\"': \"ISO-8859-1\",\n        \"text/xml\": \"utf-8\",\n    }\n\n\n@pytest.fixture\ndef status_map():\n    return {\n        100: \"Continue\",\n        101: \"Switching Protocols\",\n        102: \"Processing\",\n        103: \"Early Hints\",\n        200: \"OK\",\n        201: \"Created\",\n        202: \"Accepted\",\n        203: \"Non-Authoritative Information\",\n        204: \"No Content\",\n        205: \"Reset Content\",\n        206: \"Partial Content\",\n        207: \"Multi-Status\",\n        208: \"Already Reported\",\n        226: \"IM Used\",\n        300: 
\"Multiple Choices\",\n        301: \"Moved Permanently\",\n        302: \"Found\",\n        303: \"See Other\",\n        304: \"Not Modified\",\n        305: \"Use Proxy\",\n        307: \"Temporary Redirect\",\n        308: \"Permanent Redirect\",\n        400: \"Bad Request\",\n        401: \"Unauthorized\",\n        402: \"Payment Required\",\n        403: \"Forbidden\",\n        404: \"Not Found\",\n        405: \"Method Not Allowed\",\n        406: \"Not Acceptable\",\n        407: \"Proxy Authentication Required\",\n        408: \"Request Timeout\",\n        409: \"Conflict\",\n        410: \"Gone\",\n        411: \"Length Required\",\n        412: \"Precondition Failed\",\n        413: \"Payload Too Large\",\n        414: \"URI Too Long\",\n        415: \"Unsupported Media Type\",\n        416: \"Range Not Satisfiable\",\n        417: \"Expectation Failed\",\n        418: \"I'm a teapot\",\n        421: \"Misdirected Request\",\n        422: \"Unprocessable Entity\",\n        423: \"Locked\",\n        424: \"Failed Dependency\",\n        425: \"Too Early\",\n        426: \"Upgrade Required\",\n        428: \"Precondition Required\",\n        429: \"Too Many Requests\",\n        431: \"Request Header Fields Too Large\",\n        451: \"Unavailable For Legal Reasons\",\n        500: \"Internal Server Error\",\n        501: \"Not Implemented\",\n        502: \"Bad Gateway\",\n        503: \"Service Unavailable\",\n        504: \"Gateway Timeout\",\n        505: \"HTTP Version Not Supported\",\n        506: \"Variant Also Negotiates\",\n        507: \"Insufficient Storage\",\n        508: \"Loop Detected\",\n        510: \"Not Extended\",\n        511: \"Network Authentication Required\",\n    }\n\n\ndef test_parsing_response_status(status_map):\n    \"\"\"Test if using different http responses' status codes returns the expected result\"\"\"\n    for status_code, expected_status_text in status_map.items():\n        assert StatusText.get(status_code) == expected_status_text\n\n\ndef test_unknown_status_code():\n    \"\"\"Test handling of an unknown status code\"\"\"\n    assert StatusText.get(1000) == \"Unknown Status Code\"\n\n\nclass TestConstructProxyDict:\n    \"\"\"Test proxy dictionary construction\"\"\"\n\n    def test_proxy_string_basic(self):\n        \"\"\"Test a basic proxy string\"\"\"\n        result = construct_proxy_dict(\"http://proxy.example.com:8080\")\n\n        expected = {\n            \"server\": \"http://proxy.example.com:8080\",\n            \"username\": \"\",\n            \"password\": \"\"\n        }\n        assert result == expected\n\n    def test_proxy_string_with_auth(self):\n        \"\"\"Test proxy string with authentication\"\"\"\n        result = construct_proxy_dict(\"http://user:pass@proxy.example.com:8080\")\n\n        expected = {\n            \"server\": \"http://proxy.example.com:8080\",\n            \"username\": \"user\",\n            \"password\": \"pass\"\n        }\n        assert result == expected\n\n    def test_proxy_dict_input(self):\n        \"\"\"Test proxy dictionary input\"\"\"\n        input_dict = {\n            \"server\": \"http://proxy.example.com:8080\",\n            \"username\": \"user\",\n            \"password\": \"pass\"\n        }\n        result = construct_proxy_dict(input_dict)\n\n        assert result == input_dict\n\n    def test_proxy_dict_minimal(self):\n        \"\"\"Test minimal proxy dictionary\"\"\"\n        input_dict = {\"server\": \"http://proxy.example.com:8080\"}\n        result = 
construct_proxy_dict(input_dict)\n\n        expected = {\n            \"server\": \"http://proxy.example.com:8080\",\n            \"username\": \"\",\n            \"password\": \"\"\n        }\n        assert result == expected\n\n    def test_invalid_proxy_string(self):\n        \"\"\"Test invalid proxy string\"\"\"\n        with pytest.raises(ValueError):\n            construct_proxy_dict(\"invalid-proxy-format\")\n\n    def test_invalid_proxy_dict(self):\n        \"\"\"Test invalid proxy dictionary\"\"\"\n        with pytest.raises(TypeError):\n            construct_proxy_dict({\"invalid\": \"structure\"})\n\n\nclass TestFingerprintFunctions:\n    \"\"\"Test fingerprint generation functions\"\"\"\n\n    def test_get_os_name(self):\n        \"\"\"Test OS name detection\"\"\"\n        result = get_os_name()\n\n        # Should return one of the known OS names or None\n        valid_names = [\"linux\", \"macos\", \"windows\", \"ios\"]\n        assert result is None or result in valid_names\n\n    def test_generate_headers_basic(self):\n        \"\"\"Test basic header generation\"\"\"\n        headers = generate_headers()\n\n        assert isinstance(headers, dict)\n        assert \"User-Agent\" in headers\n        assert len(headers[\"User-Agent\"]) > 0\n\n    def test_generate_headers_browser_mode(self):\n        \"\"\"Test header generation in browser mode\"\"\"\n        headers = generate_headers(browser_mode=True)\n\n        assert isinstance(headers, dict)\n        assert \"User-Agent\" in headers\n\n\nclass TestResponse:\n    \"\"\"Test Response class functionality\"\"\"\n\n    def test_response_creation(self):\n        \"\"\"Test Response object creation\"\"\"\n        response = Response(\n            url=\"https://example.com\",\n            content=\"<html><body>Test</body></html>\",\n            status=200,\n            reason=\"OK\",\n            cookies={\"session\": \"abc123\"},\n            headers={\"Content-Type\": \"text/html\"},\n            request_headers={\"User-Agent\": \"Test\"},\n            encoding=\"utf-8\"\n        )\n\n        assert response.url == \"https://example.com\"\n        assert response.status == 200\n        assert response.reason == \"OK\"\n        assert response.cookies == {\"session\": \"abc123\"}\n\n    def test_response_with_bytes_content(self):\n        \"\"\"Test Response with 'bytes' content\"\"\"\n        content_bytes = \"<html><body>Test</body></html>\".encode('utf-8')\n\n        response = Response(\n            url=\"https://example.com\",\n            content=content_bytes,\n            status=200,\n            reason=\"OK\",\n            cookies={},\n            headers={},\n            request_headers={}\n        )\n\n        # Should handle 'bytes' content properly\n        assert response.status == 200\n\n\nclass _MockRequest:\n    \"\"\"Minimal mock for Playwright's Request object.\"\"\"\n    def __init__(self, url: str, resource_type: str = \"document\"):\n        self.url = url\n        self.resource_type = resource_type\n\n\nclass _MockRoute:\n    \"\"\"Minimal mock for Playwright's sync Route object.\"\"\"\n    def __init__(self, url: str, resource_type: str = \"document\"):\n        self.request = _MockRequest(url, resource_type)\n        self.aborted = False\n        self.continued = False\n\n    def abort(self):\n        self.aborted = True\n\n    def continue_(self):\n        self.continued = True\n\n\nclass _AsyncMockRoute:\n    \"\"\"Minimal mock for Playwright's async Route object.\"\"\"\n    def __init__(self, url: 
str, resource_type: str = \"document\"):\n        self.request = _MockRequest(url, resource_type)\n        self.aborted = False\n        self.continued = False\n\n    async def abort(self):\n        self.aborted = True\n\n    async def continue_(self):\n        self.continued = True\n\n\nclass TestCreateInterceptHandler:\n    \"\"\"Test the unified sync route handler factory.\"\"\"\n\n    def test_blocks_disabled_resource_types(self):\n        handler = create_intercept_handler(disable_resources=True)\n        route = _MockRoute(\"https://example.com/image.png\", resource_type=\"image\")\n        handler(route)\n        assert route.aborted\n\n    def test_continues_allowed_resource_types(self):\n        handler = create_intercept_handler(disable_resources=True)\n        route = _MockRoute(\"https://example.com/page\", resource_type=\"document\")\n        handler(route)\n        assert route.continued\n\n    def test_blocks_exact_domain(self):\n        handler = create_intercept_handler(disable_resources=False, blocked_domains={\"ads.example.com\"})\n        route = _MockRoute(\"https://ads.example.com/tracker.js\")\n        handler(route)\n        assert route.aborted\n\n    def test_blocks_subdomain(self):\n        handler = create_intercept_handler(disable_resources=False, blocked_domains={\"example.com\"})\n        route = _MockRoute(\"https://sub.example.com/page\")\n        handler(route)\n        assert route.aborted\n\n    def test_continues_non_blocked_domain(self):\n        handler = create_intercept_handler(disable_resources=False, blocked_domains={\"ads.example.com\"})\n        route = _MockRoute(\"https://safe.example.com/page\")\n        handler(route)\n        assert route.continued\n\n    def test_resource_blocking_takes_priority_over_domain(self):\n        \"\"\"When both are active, resource type check comes first.\"\"\"\n        handler = create_intercept_handler(disable_resources=True, blocked_domains={\"example.com\"})\n        route = _MockRoute(\"https://example.com/style.css\", resource_type=\"stylesheet\")\n        handler(route)\n        assert route.aborted\n\n    def test_domain_blocking_with_resources_disabled(self):\n        \"\"\"Non-blocked resource type from a blocked domain should still be aborted.\"\"\"\n        handler = create_intercept_handler(disable_resources=True, blocked_domains={\"tracker.io\"})\n        route = _MockRoute(\"https://tracker.io/api\", resource_type=\"document\")\n        handler(route)\n        assert route.aborted\n\n    def test_no_blocking_continues(self):\n        handler = create_intercept_handler(disable_resources=False)\n        route = _MockRoute(\"https://example.com/page\")\n        handler(route)\n        assert route.continued\n\n    def test_does_not_block_partial_domain_match(self):\n        \"\"\"'example.com' should not block 'notexample.com'.\"\"\"\n        handler = create_intercept_handler(disable_resources=False, blocked_domains={\"example.com\"})\n        route = _MockRoute(\"https://notexample.com/page\")\n        handler(route)\n        assert route.continued\n\n    def test_multiple_blocked_domains(self):\n        handler = create_intercept_handler(disable_resources=False, blocked_domains={\"ads.com\", \"tracker.io\"})\n        route_ads = _MockRoute(\"https://ads.com/banner\")\n        route_tracker = _MockRoute(\"https://cdn.tracker.io/script.js\")\n        route_safe = _MockRoute(\"https://example.com/page\")\n        handler(route_ads)\n        handler(route_tracker)\n        handler(route_safe)\n        
assert route_ads.aborted\n        assert route_tracker.aborted\n        assert route_safe.continued\n\n\nclass TestCreateAsyncInterceptHandler:\n    \"\"\"Test the unified async route handler factory.\"\"\"\n\n    @pytest.mark.asyncio\n    async def test_blocks_disabled_resource_types(self):\n        handler = create_async_intercept_handler(disable_resources=True)\n        route = _AsyncMockRoute(\"https://example.com/font.woff\", resource_type=\"font\")\n        await handler(route)\n        assert route.aborted\n\n    @pytest.mark.asyncio\n    async def test_blocks_domain(self):\n        handler = create_async_intercept_handler(disable_resources=False, blocked_domains={\"ads.example.com\"})\n        route = _AsyncMockRoute(\"https://ads.example.com/track\")\n        await handler(route)\n        assert route.aborted\n\n    @pytest.mark.asyncio\n    async def test_continues_non_blocked(self):\n        handler = create_async_intercept_handler(disable_resources=False, blocked_domains={\"ads.example.com\"})\n        route = _AsyncMockRoute(\"https://safe.example.com/page\")\n        await handler(route)\n        assert route.continued\n\n    @pytest.mark.asyncio\n    async def test_blocks_subdomain(self):\n        handler = create_async_intercept_handler(disable_resources=False, blocked_domains={\"tracker.io\"})\n        route = _AsyncMockRoute(\"https://cdn.tracker.io/script.js\")\n        await handler(route)\n        assert route.aborted\n\n    @pytest.mark.asyncio\n    async def test_does_not_block_partial_domain_match(self):\n        handler = create_async_intercept_handler(disable_resources=False, blocked_domains={\"example.com\"})\n        route = _AsyncMockRoute(\"https://notexample.com/page\")\n        await handler(route)\n        assert route.continued\n"
  },
  {
    "path": "tests/fetchers/test_validator.py",
    "content": "import pytest\nfrom scrapling.engines._browsers._validators import (\n    validate,\n    StealthConfig,\n    PlaywrightConfig,\n)\n\n\nclass TestValidators:\n    \"\"\"Test configuration validators\"\"\"\n\n    def test_playwright_config_valid(self):\n        \"\"\"Test valid PlaywrightConfig\"\"\"\n        params = {\n            \"max_pages\": 2,\n            \"headless\": True,\n            \"timeout\": 30000,\n            \"proxy\": \"http://proxy.example.com:8080\"\n        }\n\n        config = validate(params, PlaywrightConfig)\n\n        assert config.max_pages == 2\n        assert config.headless is True\n        assert config.timeout == 30000\n        assert isinstance(config.proxy, dict)\n\n    def test_playwright_config_invalid_max_pages(self):\n        \"\"\"Test PlaywrightConfig with invalid max_pages\"\"\"\n        params = {\"max_pages\": 0}\n\n        with pytest.raises(TypeError):\n            validate(params, PlaywrightConfig)\n\n        params = {\"max_pages\": 51}\n\n        with pytest.raises(TypeError):\n            validate(params, PlaywrightConfig)\n\n    def test_playwright_config_invalid_timeout(self):\n        \"\"\"Test PlaywrightConfig with an invalid timeout\"\"\"\n        params = {\"timeout\": -1}\n\n        with pytest.raises(TypeError):\n            validate(params, PlaywrightConfig)\n\n    def test_playwright_config_invalid_cdp_url(self):\n        \"\"\"Test PlaywrightConfig with invalid CDP URL\"\"\"\n        params = {\"cdp_url\": \"invalid-url\"}\n\n        with pytest.raises(TypeError):\n            validate(params, PlaywrightConfig)\n\n    def test_stealth_config_valid(self):\n        \"\"\"Test valid StealthConfig\"\"\"\n        params = {\n            \"max_pages\": 1,\n            \"headless\": True,\n            \"solve_cloudflare\": False,\n            \"timeout\": 30000\n        }\n\n        config = validate(params, StealthConfig)\n\n        assert config.max_pages == 1\n        assert config.headless is True\n        assert config.solve_cloudflare is False\n        assert config.timeout == 30000\n\n    def test_stealth_config_cloudflare_timeout(self):\n        \"\"\"Test StealthConfig timeout adjustment for Cloudflare\"\"\"\n        params = {\n            \"solve_cloudflare\": True,\n            \"timeout\": 10000  # Less than the required 60,000\n        }\n\n        config = validate(params, StealthConfig)\n\n        assert config.timeout == 60000  # Should be increased\n\n    def test_playwright_config_blocked_domains(self):\n        \"\"\"Test PlaywrightConfig with blocked_domains\"\"\"\n        params = {\"blocked_domains\": {\"ads.example.com\", \"tracker.io\"}}\n\n        config = validate(params, PlaywrightConfig)\n\n        assert config.blocked_domains == {\"ads.example.com\", \"tracker.io\"}\n\n    def test_playwright_config_blocked_domains_default_none(self):\n        \"\"\"Test PlaywrightConfig blocked_domains defaults to None\"\"\"\n        config = validate({}, PlaywrightConfig)\n\n        assert config.blocked_domains is None\n\n    def test_stealth_config_blocked_domains(self):\n        \"\"\"Test StealthConfig inherits blocked_domains\"\"\"\n        params = {\"blocked_domains\": {\"ads.example.com\"}}\n\n        config = validate(params, StealthConfig)\n\n        assert config.blocked_domains == {\"ads.example.com\"}\n"
  },
  {
    "path": "tests/parser/__init__.py",
    "content": ""
  },
  {
    "path": "tests/parser/test_adaptive.py",
    "content": "import asyncio\n\nimport pytest\n\nfrom scrapling import Selector\n\n\nclass TestParserAdaptive:\n    def test_element_relocation(self):\n        \"\"\"Test relocating element after structure change\"\"\"\n        original_html = \"\"\"\n                <div class=\"container\">\n                    <section class=\"products\">\n                        <article class=\"product\" id=\"p1\">\n                            <h3>Product 1</h3>\n                            <p class=\"description\">Description 1</p>\n                        </article>\n                        <article class=\"product\" id=\"p2\">\n                            <h3>Product 2</h3>\n                            <p class=\"description\">Description 2</p>\n                        </article>\n                    </section>\n                </div>\n                \"\"\"\n        changed_html = \"\"\"\n                <div class=\"new-container\">\n                    <div class=\"product-wrapper\">\n                        <section class=\"products\">\n                            <article class=\"product new-class\" data-id=\"p1\">\n                                <div class=\"product-info\">\n                                    <h3>Product 1</h3>\n                                    <p class=\"new-description\">Description 1</p>\n                                </div>\n                            </article>\n                            <article class=\"product new-class\" data-id=\"p2\">\n                                <div class=\"product-info\">\n                                    <h3>Product 2</h3>\n                                    <p class=\"new-description\">Description 2</p>\n                                </div>\n                            </article>\n                        </section>\n                    </div>\n                </div>\n                \"\"\"\n\n        old_page = Selector(original_html, url=\"example.com\", adaptive=True)\n        new_page = Selector(changed_html, url=\"example.com\", adaptive=True)\n\n        # 'p1' was used as ID and now it's not and all the path elements have changes\n        # Also at the same time testing `adaptive` vs combined selectors\n        _ = old_page.css(\"#p1, #p2\", auto_save=True)[0]\n        relocated = new_page.css(\"#p1\", adaptive=True)\n\n        assert relocated is not None\n        assert relocated[0].attrib[\"data-id\"] == \"p1\"\n        assert relocated[0].has_class(\"new-class\")\n        assert relocated[0].css(\".new-description\")[0].text == \"Description 1\"\n\n    @pytest.mark.asyncio\n    async def test_element_relocation_async(self):\n        \"\"\"Test relocating element after structure change in async mode\"\"\"\n        original_html = \"\"\"\n                <div class=\"container\">\n                    <section class=\"products\">\n                        <article class=\"product\" id=\"p1\">\n                            <h3>Product 1</h3>\n                            <p class=\"description\">Description 1</p>\n                        </article>\n                        <article class=\"product\" id=\"p2\">\n                            <h3>Product 2</h3>\n                            <p class=\"description\">Description 2</p>\n                        </article>\n                    </section>\n                </div>\n                \"\"\"\n        changed_html = \"\"\"\n                <div class=\"new-container\">\n                    <div class=\"product-wrapper\">\n                        <section 
class=\"products\">\n                            <article class=\"product new-class\" data-id=\"p1\">\n                                <div class=\"product-info\">\n                                    <h3>Product 1</h3>\n                                    <p class=\"new-description\">Description 1</p>\n                                </div>\n                            </article>\n                            <article class=\"product new-class\" data-id=\"p2\">\n                                <div class=\"product-info\">\n                                    <h3>Product 2</h3>\n                                    <p class=\"new-description\">Description 2</p>\n                                </div>\n                            </article>\n                        </section>\n                    </div>\n                </div>\n                \"\"\"\n\n        # Simulate async operation\n        await asyncio.sleep(0.1)  # Minimal async operation\n\n        old_page = Selector(original_html, url=\"example.com\", adaptive=True)\n        new_page = Selector(changed_html, url=\"example.com\", adaptive=True)\n\n        # 'p1' was used as ID and now it's not and all the path elements have changes\n        # Also at the same time testing `adaptive` vs combined selectors\n        _ = old_page.css(\"#p1, #p2\", auto_save=True)[0]\n        relocated = new_page.css(\"#p1\", adaptive=True)\n\n        assert relocated is not None\n        assert relocated[0].attrib[\"data-id\"] == \"p1\"\n        assert relocated[0].has_class(\"new-class\")\n        assert relocated[0].css(\".new-description\")[0].text == \"Description 1\"\n"
  },
  {
    "path": "tests/parser/test_attributes_handler.py",
    "content": "import pytest\nimport json\n\nfrom scrapling import Selector\nfrom scrapling.core.custom_types import AttributesHandler\n\n\nclass TestAttributesHandler:\n    \"\"\"Test AttributesHandler functionality\"\"\"\n\n    @pytest.fixture\n    def sample_html(self):\n        return \"\"\"\n        <html>\n            <body>\n                <div id=\"main\" \n                     class=\"container active\" \n                     data-config='{\"theme\": \"dark\", \"version\": 2.5}'\n                     data-items='[1, 2, 3, 4, 5]'\n                     data-invalid-json='{\"broken: json}'\n                     title=\"Main Container\"\n                     style=\"color: red; background: blue;\"\n                     data-empty=\"\"\n                     data-number=\"42\"\n                     data-bool=\"true\"\n                     data-url=\"https://example.com/page?param=value\"\n                     custom-attr=\"custom-value\"\n                     data-nested='{\"user\": {\"name\": \"John\", \"age\": 30}}'\n                     data-encoded=\"&lt;div&gt;HTML&lt;/div&gt;\"\n                     onclick=\"handleClick()\"\n                     data-null=\"null\"\n                     data-undefined=\"undefined\">\n                    Content\n                </div>\n                <input type=\"text\" \n                       name=\"username\" \n                       value=\"test@example.com\" \n                       placeholder=\"Enter email\"\n                       required\n                       disabled>\n                <img src=\"/images/photo.jpg\" \n                     alt=\"Photo\" \n                     width=\"100\" \n                     height=\"100\"\n                     loading=\"lazy\">\n            </body>\n        </html>\n        \"\"\"\n\n    @pytest.fixture\n    def attributes(self, sample_html):\n        page = Selector(sample_html)\n        element = page.css(\"#main\")[0]\n        return element.attrib\n\n    def test_basic_attribute_access(self, attributes):\n        \"\"\"Test basic attribute access\"\"\"\n        # Dict-like access\n        assert attributes[\"id\"] == \"main\"\n        assert attributes[\"class\"] == \"container active\"\n        assert attributes[\"title\"] == \"Main Container\"\n\n        # Key existence\n        assert \"id\" in attributes\n        assert \"nonexistent\" not in attributes\n\n        # Get with default\n        assert attributes.get(\"id\") == \"main\"\n        assert attributes.get(\"nonexistent\") is None\n        assert attributes.get(\"nonexistent\", \"default\") == \"default\"\n\n    def test_iteration_methods(self, attributes):\n        \"\"\"Test iteration over attributes\"\"\"\n        # Keys\n        keys = list(attributes.keys())\n        assert \"id\" in keys\n        assert \"class\" in keys\n        assert \"data-config\" in keys\n\n        # Values\n        values = list(attributes.values())\n        assert \"main\" in values\n        assert \"container active\" in values\n\n        # Items\n        items = dict(attributes.items())\n        assert items[\"id\"] == \"main\"\n        assert items[\"class\"] == \"container active\"\n\n        # Length\n        assert len(attributes) > 0\n\n    def test_json_parsing(self, attributes):\n        \"\"\"Test JSON parsing from attributes\"\"\"\n        # Valid JSON object\n        config = attributes[\"data-config\"].json()\n        assert config[\"theme\"] == \"dark\"\n        assert config[\"version\"] == 2.5\n\n        # Valid JSON array\n        
items = attributes[\"data-items\"].json()\n        assert items == [1, 2, 3, 4, 5]\n\n        # Nested JSON\n        nested = attributes[\"data-nested\"].json()\n        assert nested[\"user\"][\"name\"] == \"John\"\n        assert nested[\"user\"][\"age\"] == 30\n\n        # JSON null\n        assert attributes[\"data-null\"].json() is None\n\n    def test_json_error_handling(self, attributes):\n        \"\"\"Test JSON parsing error handling\"\"\"\n        # Invalid JSON should raise error or return None\n        with pytest.raises((json.JSONDecodeError, AttributeError)):\n            attributes[\"data-invalid-json\"].json()\n\n        # Non-existent attribute\n        with pytest.raises(KeyError):\n            attributes[\"nonexistent\"].json()\n\n    def test_json_string_property(self, attributes):\n        \"\"\"Test json_string property\"\"\"\n        # Should return JSON representation of all attributes\n        json_string = attributes.json_string\n        assert isinstance(json_string, bytes)\n\n        # Parse it back\n        parsed = json.loads(json_string)\n        assert parsed[\"id\"] == \"main\"\n        assert parsed[\"class\"] == \"container active\"\n\n    def test_search_values(self, attributes):\n        \"\"\"Test search_values method\"\"\"\n        # Exact match\n        results = list(attributes.search_values(\"main\", partial=False))\n        assert len(results) == 1\n        assert \"id\" in results[0]\n\n        # Partial match\n        results = list(attributes.search_values(\"container\", partial=True))\n        assert len(results) >= 1\n        found_keys = []\n        for result in results:\n            found_keys.extend(result.keys())\n        assert \"class\" in found_keys or \"title\" in found_keys\n\n        # Case sensitivity\n        results = list(attributes.search_values(\"MAIN\", partial=False))\n        assert len(results) == 0  # Should be case-sensitive by default\n\n        # Multiple matches\n        results = list(attributes.search_values(\"2\", partial=True))\n        assert len(results) > 1  # Should find multiple attributes\n\n        # No matches\n        results = list(attributes.search_values(\"nonexistent\", partial=False))\n        assert len(results) == 0\n\n    def test_special_attribute_types(self, sample_html):\n        \"\"\"Test handling of special attribute types\"\"\"\n        page = Selector(sample_html)\n\n        # Boolean attributes\n        input_elem = page.css(\"input\")[0]\n        assert \"required\" in input_elem.attrib\n        assert \"disabled\" in input_elem.attrib\n\n        # Empty attributes\n        main_elem = page.css(\"#main\")[0]\n        assert main_elem.attrib[\"data-empty\"] == \"\"\n\n        # Numeric string attributes\n        assert main_elem.attrib[\"data-number\"] == \"42\"\n        assert main_elem.attrib[\"data-bool\"] == \"true\"\n\n    def test_attribute_modification(self, sample_html):\n        \"\"\"Test that AttributesHandler is read-only (if applicable)\"\"\"\n        page = Selector(sample_html)\n        element = page.css(\"#main\")[0]\n        attrs = element.attrib\n\n        # Test if attributes can be modified\n        # This behavior depends on implementation\n        original_id = attrs[\"id\"]\n        try:\n            attrs[\"id\"] = \"new-id\"\n            # If modification is allowed\n            assert attrs[\"id\"] == \"new-id\"\n            # Reset\n            attrs[\"id\"] = original_id\n        except (TypeError, AttributeError):\n            # If modification is not 
allowed (read-only)\n            assert attrs[\"id\"] == original_id\n\n    def test_string_representation(self, attributes):\n        \"\"\"Test string representations\"\"\"\n        # __str__\n        str_repr = str(attributes)\n        assert isinstance(str_repr, str)\n        assert \"id\" in str_repr or \"main\" in str_repr\n\n        # __repr__\n        repr_str = repr(attributes)\n        assert isinstance(repr_str, str)\n\n    def test_edge_cases(self, sample_html):\n        \"\"\"Test edge cases and special scenarios\"\"\"\n        page = Selector(sample_html)\n\n        # Element with no attributes\n        page_with_no_attrs = Selector(\"<div>Content</div>\")\n        elem = page_with_no_attrs.css(\"div\")[0]\n        assert len(elem.attrib) == 0\n        assert list(elem.attrib.keys()) == []\n        assert elem.attrib.get(\"any\") is None\n\n        # Element with encoded content\n        main_elem = page.css(\"#main\")[0]\n        encoded = main_elem.attrib[\"data-encoded\"]\n        assert \"<\" in encoded  # Should decode it\n\n        # Style attribute parsing\n        style = main_elem.attrib[\"style\"]\n        assert \"color: red\" in style\n        assert \"background: blue\" in style\n\n    def test_url_attribute(self, attributes):\n        \"\"\"Test URL attributes\"\"\"\n        url = attributes[\"data-url\"]\n        assert url == \"https://example.com/page?param=value\"\n\n        # Could test URL joining if AttributesHandler supports it\n        # based on the parent element's base URL\n\n    def test_comparison_operations(self, sample_html):\n        \"\"\"Test comparison operations if supported\"\"\"\n        page = Selector(sample_html)\n        elem1 = page.css(\"#main\")[0]\n        elem2 = page.css(\"input\")[0]\n\n        # Different elements should have different attributes\n        assert elem1.attrib != elem2.attrib\n\n        # The same element should have equal attributes\n        elem1_again = page.css(\"#main\")[0]\n        assert elem1.attrib == elem1_again.attrib\n\n    def test_complex_search_patterns(self, attributes):\n        \"\"\"Test complex search patterns\"\"\"\n        # Search for JSON-containing attributes\n        json_attrs = []\n        for key, value in attributes.items():\n            try:\n                if isinstance(value, str) and (value.startswith('{') or value.startswith('[')):\n                    json.loads(value)\n                    json_attrs.append(key)\n            except:\n                pass\n\n        assert \"data-config\" in json_attrs\n        assert \"data-items\" in json_attrs\n        assert \"data-nested\" in json_attrs\n\n    def test_attribute_filtering(self, attributes):\n        \"\"\"Test filtering attributes by patterns\"\"\"\n        # Get all data-* attributes\n        data_attrs = {k: v for k, v in attributes.items() if k.startswith(\"data-\")}\n        assert len(data_attrs) > 5\n        assert \"data-config\" in data_attrs\n        assert \"data-items\" in data_attrs\n\n        # Get all event handler attributes\n        event_attrs = {k: v for k, v in attributes.items() if k.startswith(\"on\")}\n        assert \"onclick\" in event_attrs\n\n    def test_performance_with_many_attributes(self):\n        \"\"\"Test performance with elements having many attributes\"\"\"\n        # Create an element with many attributes\n        attrs_list = [f'data-attr{i}=\"value{i}\"' for i in range(100)]\n        html = f'<div id=\"test\" {\" \".join(attrs_list)}>Content</div>'\n\n        page = Selector(html)\n  
      element = page.css(\"#test\")[0]\n        attribs = element.attrib\n\n        # Should handle many attributes efficiently\n        assert len(attribs) == 101  # id + 100 data attributes\n\n        # Search should still work efficiently\n        results = list(attribs.search_values(\"value50\", partial=False))\n        assert len(results) == 1\n\n    def test_unicode_attributes(self):\n        \"\"\"Test handling of Unicode in attributes\"\"\"\n        html = \"\"\"\n        <div id=\"unicode-test\"\n             data-emoji=\"😀🎉\"\n             data-chinese=\"你好世界\"\n             data-arabic=\"مرحبا بالعالم\"\n             data-special=\"café naïve\">\n        </div>\n        \"\"\"\n\n        page = Selector(html)\n        attrs = page.css(\"#unicode-test\")[0].attrib\n\n        assert attrs[\"data-emoji\"] == \"😀🎉\"\n        assert attrs[\"data-chinese\"] == \"你好世界\"\n        assert attrs[\"data-arabic\"] == \"مرحبا بالعالم\"\n        assert attrs[\"data-special\"] == \"café naïve\"\n\n        # Search with Unicode\n        results = list(attrs.search_values(\"你好\", partial=True))\n        assert len(results) == 1\n\n    def test_malformed_attributes(self):\n        \"\"\"Test handling of malformed attributes\"\"\"\n        # Various malformed HTML scenarios\n        test_cases = [\n            '<div id=\"test\" class=>Content</div>',  # Empty attribute value\n            '<div id=\"test\" class>Content</div>',  # No attribute value\n            '<div id=\"test\" data-\"invalid\"=\"value\">Content</div>',  # Invalid attribute name\n            '<div id=test class=no-quotes>Content</div>',  # Unquoted values\n        ]\n\n        for html in test_cases:\n            try:\n                page = Selector(html)\n                if page.css(\"div\"):\n                    attrs = page.css(\"div\")[0].attrib\n                    # Should handle gracefully without crashing\n                    assert isinstance(attrs, AttributesHandler)\n            except:\n                # Some malformed HTML might not parse at all\n                pass\n"
  },
  {
    "path": "tests/parser/test_general.py",
    "content": "import pickle\nimport time\nimport logging\n\nimport pytest\nfrom cssselect import SelectorError, SelectorSyntaxError\n\nfrom scrapling import Selector\nlogging.getLogger(\"scrapling\").setLevel(logging.DEBUG)\n\n\n@pytest.fixture\ndef html_content():\n    return \"\"\"\n    <html>\n    <head>\n        <title>Complex Web Page</title>\n        <style>\n            .hidden { display: none; }\n        </style>\n    </head>\n    <body>\n        <header>\n            <nav>\n                <ul>\n                    <li><a href=\"#home\">Home</a></li>\n                    <li><a href=\"#about\">About</a></li>\n                    <li><a href=\"#contact\">Contact</a></li>\n                </ul>\n            </nav>\n        </header>\n        <main>\n            <section id=\"products\" schema='{\"jsonable\": \"data\"}'>\n                <h2>Products</h2>\n                <div class=\"product-list\">\n                    <article class=\"product\" data-id=\"1\">\n                        <h3>Product 1</h3>\n                        <p class=\"description\">This is product 1</p>\n                        <span class=\"price\">$10.99</span>\n                        <div class=\"hidden stock\">In stock: 5</div>\n                    </article>\n                    <article class=\"product\" data-id=\"2\">\n                        <h3>Product 2</h3>\n                        <p class=\"description\">This is product 2</p>\n                        <span class=\"price\">$20.99</span>\n                        <div class=\"hidden stock\">In stock: 3</div>\n                    </article>\n                    <article class=\"product\" data-id=\"3\">\n                        <h3>Product 3</h3>\n                        <p class=\"description\">This is product 3</p>\n                        <span class=\"price\">$15.99</span>\n                        <div class=\"hidden stock\">Out of stock</div>\n                    </article>\n                </div>\n            </section>\n            <section id=\"reviews\">\n                <h2>Customer Reviews</h2>\n                <div class=\"review-list\">\n                    <div class=\"review\" data-rating=\"5\">\n                        <p class=\"review-text\">Great product!</p>\n                        <span class=\"reviewer\">John Doe</span>\n                    </div>\n                    <div class=\"review\" data-rating=\"4\">\n                        <p class=\"review-text\">Good value for money.</p>\n                        <span class=\"reviewer\">Jane Smith</span>\n                    </div>\n                </div>\n            </section>\n        </main>\n        <footer>\n            <p>&copy; 2024 Our Company</p>\n        </footer>\n        <script id=\"page-data\" type=\"application/json\">\n            {\"lastUpdated\": \"2024-09-22T10:30:00Z\", \"totalProducts\": 3}\n        </script>\n    </body>\n    </html>\n    \"\"\"\n\n\n@pytest.fixture\ndef page(html_content):\n    return Selector(html_content, adaptive=False)\n\n\n# CSS Selector Tests\nclass TestCSSSelectors:\n    def test_basic_product_selection(self, page):\n        \"\"\"Test selecting all product elements\"\"\"\n        elements = page.css(\"main #products .product-list article.product\")\n        assert len(elements) == 3\n\n    def test_in_stock_product_selection(self, page):\n        \"\"\"Test selecting in-stock products\"\"\"\n        in_stock_products = page.css(\n            'main #products .product-list article.product:not(:contains(\"Out of stock\"))'\n        )\n   
     assert len(in_stock_products) == 2\n\n\n# XPath Selector Tests\nclass TestXPathSelectors:\n    def test_high_rating_reviews(self, page):\n        \"\"\"Test selecting reviews with high ratings\"\"\"\n        reviews = page.xpath(\n            '//section[@id=\"reviews\"]//div[contains(@class, \"review\") and @data-rating >= 4]'\n        )\n        assert len(reviews) == 2\n\n    def test_high_priced_products(self, page):\n        \"\"\"Test selecting products above a certain price\"\"\"\n        high_priced_products = page.xpath(\n            '//article[contains(@class, \"product\")]'\n            '[number(translate(substring-after(.//span[@class=\"price\"], \"$\"), \",\", \"\")) > 15]'\n        )\n        assert len(high_priced_products) == 2\n\n\n# Text Matching Tests\nclass TestTextMatching:\n    def test_regex_multiple_matches(self, page):\n        \"\"\"Test finding multiple matches with regex\"\"\"\n        stock_info = page.find_by_regex(r\"In stock: \\d+\", first_match=False)\n        assert len(stock_info) == 2\n\n    def test_regex_first_match(self, page):\n        \"\"\"Test finding the first match with regex\"\"\"\n        stock_info = page.find_by_regex(\n            r\"In stock: \\d+\", first_match=True, case_sensitive=True\n        )\n        assert stock_info.text == \"In stock: 5\"\n\n    def test_partial_text_match(self, page):\n        \"\"\"Test finding elements with partial text match\"\"\"\n        stock_info = page.find_by_text(r\"In stock:\", partial=True, first_match=False)\n        assert len(stock_info) == 2\n\n    def test_exact_text_match(self, page):\n        \"\"\"Test finding elements with exact text match\"\"\"\n        out_of_stock = page.find_by_text(\n            \"Out of stock\", partial=False, first_match=False\n        )\n        assert len(out_of_stock) == 1\n\n\n# Similar Elements Tests\nclass TestSimilarElements:\n    def test_finding_similar_products(self, page):\n        \"\"\"Test finding similar product elements\"\"\"\n        first_product = page.css(\".product\").first\n        similar_products = first_product.find_similar()\n        assert len(similar_products) == 2\n\n    def test_finding_similar_reviews(self, page):\n        \"\"\"Test finding similar review elements with additional filtering\"\"\"\n        first_review = page.find(\"div\", class_=\"review\")\n        similar_high_rated_reviews = [\n            review\n            for review in first_review.find_similar()\n            if int(review.attrib.get(\"data-rating\", 0)) >= 4\n        ]\n        assert len(similar_high_rated_reviews) == 1\n\n\n# Error Handling Tests\nclass TestErrorHandling:\n    def test_invalid_selector_initialization(self):\n        \"\"\"Test various invalid Selector initializations\"\"\"\n        # No arguments\n        with pytest.raises(ValueError):\n            _ = Selector(adaptive=False)\n\n        with pytest.raises(TypeError):\n            _ = Selector(content=1, adaptive=False)\n\n    def test_invalid_storage(self, page, html_content):\n        \"\"\"Test invalid storage parameter\"\"\"\n        with pytest.raises(ValueError):\n            _ = Selector(html_content, storage=object, adaptive=True)\n\n    def test_bad_selectors(self, page):\n        \"\"\"Test handling of invalid selectors\"\"\"\n        with pytest.raises((SelectorError, SelectorSyntaxError)):\n            page.css(\"4 ayo\")\n\n        with pytest.raises((SelectorError, SelectorSyntaxError)):\n            page.xpath(\"4 ayo\")\n\n\n# Pickling and Object Representation Tests\nclass 
TestPicklingAndRepresentation:\n    def test_unpickleable_objects(self, page):\n        \"\"\"Test that Selector objects cannot be pickled\"\"\"\n        table = page.css(\".product-list\")[0]\n        with pytest.raises(TypeError):\n            pickle.dumps(table)\n\n    def test_string_representations(self, page):\n        \"\"\"Test custom string representations of objects\"\"\"\n        table = page.css(\".product-list\")[0]\n        assert issubclass(type(table.__str__()), str)\n        assert issubclass(type(table.__repr__()), str)\n        assert issubclass(type(table.attrib.__str__()), str)\n        assert issubclass(type(table.attrib.__repr__()), str)\n\n\n# Navigation and Traversal Tests\nclass TestElementNavigation:\n    def test_basic_navigation_properties(self, page):\n        \"\"\"Test basic navigation properties of elements\"\"\"\n        table = page.css(\".product-list\")[0]\n        assert table.path is not None\n        assert table.html_content != \"\"\n        assert table.prettify() != \"\"\n\n    def test_parent_and_sibling_navigation(self, page):\n        \"\"\"Test parent and sibling navigation\"\"\"\n        table = page.css(\".product-list\")[0]\n        parent = table.parent\n        assert parent[\"id\"] == \"products\"\n\n        parent_siblings = parent.siblings\n        assert len(parent_siblings) == 1\n\n    def test_child_navigation(self, page):\n        \"\"\"Test child navigation\"\"\"\n        table = page.css(\".product-list\")[0]\n        children = table.children\n        assert len(children) == 3\n\n    def test_next_and_previous_navigation(self, page):\n        \"\"\"Test next and previous element navigation\"\"\"\n        child = page.css(\".product-list\")[0].find({\"data-id\": \"1\"})\n        next_element = child.next\n        assert next_element.attrib[\"data-id\"] == \"2\"\n\n        prev_element = next_element.previous\n        assert prev_element.tag == child.tag\n\n    def test_ancestor_finding(self, page):\n        \"\"\"Test finding ancestors of elements\"\"\"\n        all_prices = page.css(\".price\")\n        products_with_prices = [\n            price.find_ancestor(lambda p: p.has_class(\"product\"))\n            for price in all_prices\n        ]\n        assert len(products_with_prices) == 3\n\n\n# JSON and Attribute Tests\nclass TestJSONAndAttributes:\n    def test_json_conversion(self, page):\n        \"\"\"Test converting content to JSON\"\"\"\n        script_content = page.css(\"#page-data::text\")[0].get()\n        assert issubclass(type(script_content.sort()), str)\n        page_data = script_content.json()\n        assert page_data[\"totalProducts\"] == 3\n        assert \"lastUpdated\" in page_data\n\n    def test_attribute_operations(self, page):\n        \"\"\"Test various attribute-related operations\"\"\"\n        # Product ID extraction\n        products = page.css(\".product\")\n        product_ids = [product.attrib[\"data-id\"] for product in products]\n        assert product_ids == [\"1\", \"2\", \"3\"]\n        assert \"data-id\" in products[0]\n\n        # Review rating calculations\n        reviews = page.css(\".review\")\n        review_ratings = [int(review.attrib[\"data-rating\"]) for review in reviews]\n        assert sum(review_ratings) / len(review_ratings) == 4.5\n\n        # Attribute searching\n        key_value = list(products[0].attrib.search_values(\"1\", partial=False))\n        assert list(key_value[0].keys()) == [\"data-id\"]\n\n        key_value = list(products[0].attrib.search_values(\"1\", 
partial=True))\n        assert list(key_value[0].keys()) == [\"data-id\"]\n\n        # JSON attribute conversion\n        attr_json = page.css(\"#products\").first.attrib[\"schema\"].json()\n        assert attr_json == {\"jsonable\": \"data\"}\n        assert isinstance(page.css(\"#products\")[0].attrib.json_string, bytes)\n\n\n# Performance Test\ndef test_large_html_parsing_performance():\n    \"\"\"Test parsing and selecting performance on large HTML\"\"\"\n    large_html = (\n        \"<html><body>\"\n        + '<div class=\"item\">' * 5000\n        + \"</div>\" * 5000\n        + \"</body></html>\"\n    )\n\n    start_time = time.time()\n    parsed = Selector(large_html, adaptive=False)\n    elements = parsed.css(\".item\")\n    end_time = time.time()\n\n    # assert len(elements) == 5000  # GitHub Actions doesn't like this line\n    # Converting 5000 elements to a class and doing operations on them takes time.\n    # Based on my tests (100 runs, 1 loop each), Scrapling takes 10.4ms on average given the extra work/features.\n    assert (\n        end_time - start_time < 0.5\n    )  # Locally I test against 0.1, but on GitHub Actions, browser work and thread shutdown sometimes add fractions of a second\n\n\n# Selector Generation Test\ndef test_selectors_generation(page):\n    \"\"\"Try to create selectors for all elements in the page\"\"\"\n\n    def _traverse(element: Selector):\n        assert isinstance(element.generate_css_selector, str)\n        assert isinstance(element.generate_full_css_selector, str)\n        assert isinstance(element.generate_xpath_selector, str)\n        assert isinstance(element.generate_full_xpath_selector, str)\n        for branch in element.children:\n            _traverse(branch)\n\n    _traverse(page)\n\n\n# Miscellaneous Tests\ndef test_getting_all_text(page):\n    \"\"\"Test getting all text from the page\"\"\"\n    assert page.get_all_text() != \"\"\n\n\ndef test_regex_on_text(page):\n    \"\"\"Test regex operations on text\"\"\"\n    element = page.css('[data-id=\"1\"] .price')[0]\n    match = element.re_first(r\"[\\.\\d]+\")\n    assert match == \"10.99\"\n    match = element.text.re(r\"(\\d+)\", replace_entities=False)\n    assert len(match) == 2\n"
  },
  {
    "path": "tests/parser/test_parser_advanced.py",
    "content": "import re\nimport pytest\nfrom unittest.mock import Mock\n\nfrom scrapling import Selector, Selectors\nfrom scrapling.core.custom_types import TextHandler, TextHandlers\nfrom scrapling.core.storage import SQLiteStorageSystem\n\n\nclass TestSelectorAdvancedFeatures:\n    \"\"\"Test advanced Selector features like adaptive matching\"\"\"\n\n    def test_adaptive_initialization_with_storage(self):\n        \"\"\"Test adaptive initialization with custom storage\"\"\"\n        html = \"<html><body><p>Test</p></body></html>\"\n\n        # Use the actual SQLiteStorageSystem for this test\n        selector = Selector(\n            content=html,\n            adaptive=True,\n            storage=SQLiteStorageSystem,\n            storage_args={\"storage_file\": \":memory:\", \"url\": \"https://example.com\"}\n        )\n\n        assert selector._Selector__adaptive_enabled is True\n        assert selector._storage is not None\n\n    def test_adaptive_initialization_with_default_storage_args(self):\n        \"\"\"Test adaptive initialization with default storage args\"\"\"\n        html = \"<html><body><p>Test</p></body></html>\"\n        url = \"https://example.com\"\n\n        # Test that adaptive mode uses default storage when no explicit args provided\n        selector = Selector(\n            content=html,\n            url=url,\n            adaptive=True\n        )\n\n        # Should create storage with default args\n        assert selector._storage is not None\n\n    def test_adaptive_with_existing_storage(self):\n        \"\"\"Test adaptive initialization with existing storage object\"\"\"\n        html = \"<html><body><p>Test</p></body></html>\"\n\n        mock_storage = Mock()\n\n        selector = Selector(\n            content=html,\n            adaptive=True,\n            _storage=mock_storage\n        )\n\n        assert selector._storage is mock_storage\n\n\nclass TestAdvancedSelectors:\n    \"\"\"Test advanced selector functionality\"\"\"\n\n    @pytest.fixture\n    def complex_html(self):\n        return \"\"\"\n        <html>\n            <body>\n                <div class=\"container\" data-test='{\"key\": \"value\"}'>\n                    <p>First paragraph</p>\n                    <!-- Comment -->\n                    <p>Second paragraph</p>\n                    <![CDATA[Some CDATA content]]>\n                    <div class=\"nested\">\n                        <span id=\"special\">Special content</span>\n                        <span>Regular content</span>\n                    </div>\n                    <table>\n                        <tr><td>Cell 1</td><td>Cell 2</td></tr>\n                        <tr><td>Cell 3</td><td>Cell 4</td></tr>\n                    </table>\n                </div>\n            </body>\n        </html>\n        \"\"\"\n\n    def test_comment_and_cdata_handling(self, complex_html):\n        \"\"\"Test handling of comments and CDATA\"\"\"\n        # With comments/CDATA kept\n        page = Selector(\n            complex_html,\n            keep_comments=True,\n            keep_cdata=True\n        )\n        content = page.body\n        assert \"Comment\" in content\n        assert \"CDATA\" in content\n\n        # Without comments/CDATA\n        page = Selector(\n            complex_html,\n            keep_comments=False,\n            keep_cdata=False\n        )\n        content = page.html_content\n        assert \"Comment\" not in content\n\n    def test_advanced_xpath_variables(self, complex_html):\n        \"\"\"Test XPath with 
variables\"\"\"\n        page = Selector(complex_html)\n\n        # Using XPath variables\n        cells = page.xpath(\n            \"//td[text()=$cell_text]\",\n            cell_text=\"Cell 1\"\n        )\n        assert len(cells) == 1\n        assert cells[0].text == \"Cell 1\"\n\n    def test_pseudo_elements(self, complex_html):\n        \"\"\"Test CSS pseudo-elements\"\"\"\n        page = Selector(complex_html)\n\n        # ::text pseudo-element\n        texts = page.css(\"p::text\")\n        assert len(texts) == 2\n        assert isinstance(texts[0], Selector)\n        assert isinstance(texts[0].get(), TextHandler)\n\n        # ::attr() pseudo-element\n        attrs = page.css(\"div::attr(class)\")\n        assert \"container\" in attrs.getall()\n\n    def test_complex_attribute_operations(self, complex_html):\n        \"\"\"Test complex attribute handling\"\"\"\n        page = Selector(complex_html)\n        container = page.css(\".container\")[0]\n\n        # JSON in attributes\n        data = container.attrib[\"data-test\"].json()\n        assert data[\"key\"] == \"value\"\n\n        # Attribute searching\n        matches = list(container.attrib.search_values(\"container\"))\n        assert len(matches) == 1\n\n    def test_url_joining(self):\n        \"\"\"Test URL joining functionality\"\"\"\n        page = Selector(\"<html></html>\", url=\"https://example.com/page\")\n\n        # Relative URL\n        assert page.urljoin(\"../other\") == \"https://example.com/other\"\n        assert page.urljoin(\"/absolute\") == \"https://example.com/absolute\"\n        assert page.urljoin(\"relative\") == \"https://example.com/relative\"\n\n    def test_find_operations_edge_cases(self, complex_html):\n        \"\"\"Test edge cases in find operations\"\"\"\n        page = Selector(complex_html)\n\n        # Multiple argument types\n        _ = page.find_all(\n            \"span\",\n            [\"div\"],\n            {\"class\": \"nested\"},\n            lambda e: e.text != \"\"\n        )\n\n        # Regex pattern matching\n        pattern = re.compile(r\"Cell \\d+\")\n        cells = page.find_all(pattern)\n        assert len(cells) == 4\n\n    def test_text_operations_edge_cases(self, complex_html):\n        \"\"\"Test text operation edge cases\"\"\"\n        page = Selector(complex_html)\n\n        # get_all_text with a custom separator\n        text = page.get_all_text(separator=\" | \", strip=True)\n        assert \" | \" in text\n\n        # Ignore specific tags\n        text = page.get_all_text(ignore_tags=(\"table\",))\n        assert \"Cell\" not in text\n\n        # With empty values\n        text = page.get_all_text(valid_values=False)\n        assert text != \"\"\n\n    def test_get_all_text_preserves_interleaved_text_nodes(self):\n        \"\"\"Test get_all_text preserves interleaved text nodes\"\"\"\n        html = \"\"\"\n        <html>\n        <body>\n            <main>\n                string1\n                <b>string2</b>\n                string3\n                <div>\n                    <span>string4</span>\n                </div>\n                string5\n                <script>ignored</script>\n                string6\n                <style>ignored</style>\n                string7\n            </main>\n        </body>\n        </html>\n        \"\"\"\n\n        page = Selector(html, adaptive=False)\n        node = page.css(\"main\")[0]\n\n        assert node.get_all_text(\"\\n\", strip=True) == 
\"string1\\nstring2\\nstring3\\nstring4\\nstring5\\nstring6\\nstring7\"\n\n\nclass TestTextHandlerAdvanced:\n    \"\"\"Test advanced TextHandler functionality\"\"\"\n\n    def test_text_handler_operations(self):\n        \"\"\"Test various TextHandler operations\"\"\"\n        text = TextHandler(\"  Hello World  \")\n\n        # All string methods should return TextHandler\n        assert isinstance(text.strip(), TextHandler)\n        assert isinstance(text.upper(), TextHandler)\n        assert isinstance(text.lower(), TextHandler)\n        assert isinstance(text.replace(\"World\", \"Python\"), TextHandler)\n\n        # Custom methods\n        assert text.clean() == \"Hello World\"\n\n        # Sorting\n        text2 = TextHandler(\"dcba\")\n        assert text2.sort() == \"abcd\"\n\n    def test_text_handler_regex(self):\n        \"\"\"Test regex operations on TextHandler\"\"\"\n        text = TextHandler(\"Price: $10.99, Sale: $8.99\")\n\n        # Basic regex\n        prices = text.re(r\"\\$[\\d.]+\")\n        assert len(prices) == 2\n        assert prices[0] == \"$10.99\"\n\n        # Case insensitive\n        text2 = TextHandler(\"HELLO hello HeLLo\")\n        matches = text2.re(r\"hello\", case_sensitive=False)\n        assert len(matches) == 3\n\n        # Clean match\n        text3 = TextHandler(\" He  l  lo  \")\n        matches = text3.re(r\"He l lo\", clean_match=True, case_sensitive=False)\n        assert len(matches) == 1\n\n    def test_text_handlers_operations(self):\n        \"\"\"Test TextHandlers list operations\"\"\"\n        handlers = TextHandlers([\n            TextHandler(\"First\"),\n            TextHandler(\"Second\"),\n            TextHandler(\"Third\")\n        ])\n\n        # Slicing should return TextHandlers\n        assert isinstance(handlers[0:2], TextHandlers)\n\n        # Get methods\n        assert handlers.get() == \"First\"\n        assert handlers.get(\"default\") == \"First\"\n        assert TextHandlers([]).get(\"default\") == \"default\"\n\n\nclass TestSelectorsAdvanced:\n    \"\"\"Test advanced Selectors functionality\"\"\"\n\n    def test_selectors_filtering(self):\n        \"\"\"Test filtering operations on Selectors\"\"\"\n        html = \"\"\"\n        <div>\n            <p class=\"highlight\">Important</p>\n            <p>Regular</p>\n            <p class=\"highlight\">Also important</p>\n        </div>\n        \"\"\"\n        page = Selector(html)\n        paragraphs = page.css(\"p\")\n\n        # Filter by class\n        highlighted = paragraphs.filter(lambda p: p.has_class(\"highlight\"))\n        assert len(highlighted) == 2\n\n        # Search for a specific element\n        found = paragraphs.search(lambda p: p.text == \"Regular\")\n        assert found is not None\n        assert found.text == \"Regular\"\n\n    def test_selectors_properties(self):\n        \"\"\"Test Selectors properties\"\"\"\n        html = \"<div><p>1</p><p>2</p><p>3</p></div>\"\n        page = Selector(html)\n        paragraphs = page.css(\"p\")\n\n        assert paragraphs.first.text == \"1\"\n        assert paragraphs.last.text == \"3\"\n        assert paragraphs.length == 3\n"
  },
  {
    "path": "tests/requirements.txt",
    "content": "pytest>=2.8.0,<9\npytest-cov\nplaywright==1.58.0\nwerkzeug<3.0.0\npytest-httpbin==2.1.0\npytest-asyncio\nhttpbin~=0.10.0\npytest-xdist\n"
  },
  {
    "path": "tests/spiders/__init__.py",
    "content": ""
  },
  {
    "path": "tests/spiders/test_checkpoint.py",
    "content": "\"\"\"Tests for the CheckpointManager and CheckpointData classes.\"\"\"\n\nimport pickle\nimport tempfile\nfrom pathlib import Path\n\nimport pytest\nimport anyio\n\nfrom scrapling.spiders.request import Request\nfrom scrapling.spiders.checkpoint import CheckpointData, CheckpointManager\n\n\nclass TestCheckpointData:\n    \"\"\"Test CheckpointData dataclass.\"\"\"\n\n    def test_default_values(self):\n        \"\"\"Test CheckpointData with default values.\"\"\"\n        data = CheckpointData()\n\n        assert data.requests == []\n        assert data.seen == set()\n\n    def test_with_requests_and_seen(self):\n        \"\"\"Test CheckpointData with requests and seen URLs.\"\"\"\n        requests = [\n            Request(\"https://example.com/1\", priority=10),\n            Request(\"https://example.com/2\", priority=5),\n        ]\n        seen = {\"url1\", \"url2\", \"url3\"}\n\n        data = CheckpointData(requests=requests, seen=seen)\n\n        assert len(data.requests) == 2\n        assert data.requests[0].url == \"https://example.com/1\"\n        assert data.seen == {\"url1\", \"url2\", \"url3\"}\n\n    def test_pickle_roundtrip(self):\n        \"\"\"Test that CheckpointData can be pickled and unpickled.\"\"\"\n        requests = [Request(\"https://example.com\", priority=5)]\n        seen = {\"fingerprint1\", \"fingerprint2\"}\n        data = CheckpointData(requests=requests, seen=seen)\n\n        pickled = pickle.dumps(data)\n        restored = pickle.loads(pickled)\n\n        assert len(restored.requests) == 1\n        assert restored.requests[0].url == \"https://example.com\"\n        assert restored.seen == {\"fingerprint1\", \"fingerprint2\"}\n\n\nclass TestCheckpointManagerInit:\n    \"\"\"Test CheckpointManager initialization.\"\"\"\n\n    def test_init_with_string_path(self):\n        \"\"\"Test initialization with string path.\"\"\"\n        manager = CheckpointManager(\"/tmp/test_crawl\")\n\n        assert str(manager.crawldir) == \"/tmp/test_crawl\"\n        assert manager.interval == 300.0\n\n    def test_init_with_pathlib_path(self):\n        \"\"\"Test initialization with pathlib.Path.\"\"\"\n        path = Path(\"/tmp/test_crawl\")\n        manager = CheckpointManager(path)\n\n        assert str(manager.crawldir) == \"/tmp/test_crawl\"\n\n    def test_init_with_custom_interval(self):\n        \"\"\"Test initialization with custom interval.\"\"\"\n        manager = CheckpointManager(\"/tmp/test\", interval=60.0)\n        assert manager.interval == 60.0\n\n    def test_init_with_zero_interval(self):\n        \"\"\"Test initialization with zero interval (disable periodic checkpoints).\"\"\"\n        manager = CheckpointManager(\"/tmp/test\", interval=0)\n        assert manager.interval == 0\n\n    def test_init_with_negative_interval_raises(self):\n        \"\"\"Test that negative interval raises ValueError.\"\"\"\n        with pytest.raises(ValueError, match=\"greater than 0\"):\n            CheckpointManager(\"/tmp/test\", interval=-1)\n\n    def test_init_with_invalid_interval_type_raises(self):\n        \"\"\"Test that invalid interval type raises TypeError.\"\"\"\n        with pytest.raises(TypeError, match=\"integer or float\"):\n            CheckpointManager(\"/tmp/test\", interval=\"invalid\")  # type: ignore\n\n    def test_checkpoint_file_path(self):\n        \"\"\"Test that checkpoint file path is correctly constructed.\"\"\"\n        manager = CheckpointManager(\"/tmp/test_crawl\")\n\n        expected_path = 
\"/tmp/test_crawl/checkpoint.pkl\"\n        assert str(manager._checkpoint_path) == expected_path\n\n\nclass TestCheckpointManagerOperations:\n    \"\"\"Test CheckpointManager save/load/cleanup operations.\"\"\"\n\n    @pytest.fixture\n    def temp_dir(self):\n        \"\"\"Create a temporary directory for testing.\"\"\"\n        with tempfile.TemporaryDirectory() as tmpdir:\n            yield Path(tmpdir)\n\n    @pytest.mark.asyncio\n    async def test_has_checkpoint_false_when_no_file(self, temp_dir: Path):\n        \"\"\"Test has_checkpoint returns False when no checkpoint exists.\"\"\"\n        manager = CheckpointManager(temp_dir / \"crawl\")\n\n        result = await manager.has_checkpoint()\n\n        assert result is False\n\n    @pytest.mark.asyncio\n    async def test_save_creates_checkpoint_file(self, temp_dir: Path):\n        \"\"\"Test that save creates the checkpoint file.\"\"\"\n        crawl_dir = temp_dir / \"crawl\"\n        manager = CheckpointManager(crawl_dir)\n\n        data = CheckpointData(\n            requests=[Request(\"https://example.com\")],\n            seen={\"fp1\", \"fp2\"},\n        )\n\n        await manager.save(data)\n\n        checkpoint_path = crawl_dir / \"checkpoint.pkl\"\n        assert checkpoint_path.exists()\n\n    @pytest.mark.asyncio\n    async def test_save_creates_directory_if_not_exists(self, temp_dir: Path):\n        \"\"\"Test that save creates the directory if it doesn't exist.\"\"\"\n        crawl_dir = temp_dir / \"nested\" / \"crawl\" / \"dir\"\n        manager = CheckpointManager(crawl_dir)\n\n        data = CheckpointData()\n        await manager.save(data)\n\n        assert crawl_dir.exists()\n\n    @pytest.mark.asyncio\n    async def test_has_checkpoint_true_after_save(self, temp_dir: Path):\n        \"\"\"Test has_checkpoint returns True after saving.\"\"\"\n        manager = CheckpointManager(temp_dir / \"crawl\")\n\n        data = CheckpointData()\n        await manager.save(data)\n\n        result = await manager.has_checkpoint()\n        assert result is True\n\n    @pytest.mark.asyncio\n    async def test_load_returns_none_when_no_checkpoint(self, temp_dir: Path):\n        \"\"\"Test load returns None when no checkpoint exists.\"\"\"\n        manager = CheckpointManager(temp_dir / \"crawl\")\n\n        result = await manager.load()\n\n        assert result is None\n\n    @pytest.mark.asyncio\n    async def test_save_and_load_roundtrip(self, temp_dir: Path):\n        \"\"\"Test saving and loading checkpoint data.\"\"\"\n        manager = CheckpointManager(temp_dir / \"crawl\")\n\n        original_data = CheckpointData(\n            requests=[\n                Request(\"https://example.com/1\", priority=10),\n                Request(\"https://example.com/2\", priority=5),\n            ],\n            seen={\"fp1\", \"fp2\", \"fp3\"},\n        )\n\n        await manager.save(original_data)\n        loaded_data = await manager.load()\n\n        assert loaded_data is not None\n        assert len(loaded_data.requests) == 2\n        assert loaded_data.requests[0].url == \"https://example.com/1\"\n        assert loaded_data.requests[0].priority == 10\n        assert loaded_data.seen == {\"fp1\", \"fp2\", \"fp3\"}\n\n    @pytest.mark.asyncio\n    async def test_save_is_atomic(self, temp_dir: Path):\n        \"\"\"Test that save uses atomic write (temp file + rename).\"\"\"\n        crawl_dir = temp_dir / \"crawl\"\n        manager = CheckpointManager(crawl_dir)\n\n        data = 
CheckpointData(requests=[Request(\"https://example.com\")])\n        await manager.save(data)\n\n        # Temp file should not exist after successful save\n        temp_path = crawl_dir / \"checkpoint.tmp\"\n        assert not temp_path.exists()\n\n        # Checkpoint file should exist\n        checkpoint_path = crawl_dir / \"checkpoint.pkl\"\n        assert checkpoint_path.exists()\n\n    @pytest.mark.asyncio\n    async def test_cleanup_removes_checkpoint_file(self, temp_dir: Path):\n        \"\"\"Test that cleanup removes the checkpoint file.\"\"\"\n        crawl_dir = temp_dir / \"crawl\"\n        manager = CheckpointManager(crawl_dir)\n\n        # Save a checkpoint first\n        data = CheckpointData()\n        await manager.save(data)\n\n        checkpoint_path = crawl_dir / \"checkpoint.pkl\"\n        assert checkpoint_path.exists()\n\n        # Cleanup should remove it\n        await manager.cleanup()\n\n        assert not checkpoint_path.exists()\n\n    @pytest.mark.asyncio\n    async def test_cleanup_no_error_when_no_file(self, temp_dir: Path):\n        \"\"\"Test that cleanup doesn't raise error when no file exists.\"\"\"\n        manager = CheckpointManager(temp_dir / \"crawl\")\n\n        # Should not raise\n        await manager.cleanup()\n\n    @pytest.mark.asyncio\n    async def test_load_returns_none_on_corrupt_file(self, temp_dir: Path):\n        \"\"\"Test load returns None when checkpoint file is corrupt.\"\"\"\n        crawl_dir = temp_dir / \"crawl\"\n        crawl_dir.mkdir(parents=True)\n\n        checkpoint_path = crawl_dir / \"checkpoint.pkl\"\n        checkpoint_path.write_bytes(b\"not valid pickle data\")\n\n        manager = CheckpointManager(crawl_dir)\n\n        result = await manager.load()\n\n        assert result is None\n\n    @pytest.mark.asyncio\n    async def test_multiple_saves_overwrite(self, temp_dir: Path):\n        \"\"\"Test that multiple saves overwrite the checkpoint.\"\"\"\n        manager = CheckpointManager(temp_dir / \"crawl\")\n\n        # First save\n        data1 = CheckpointData(\n            requests=[Request(\"https://example.com/1\")],\n            seen={\"fp1\"},\n        )\n        await manager.save(data1)\n\n        # Second save\n        data2 = CheckpointData(\n            requests=[Request(\"https://example.com/2\"), Request(\"https://example.com/3\")],\n            seen={\"fp2\", \"fp3\"},\n        )\n        await manager.save(data2)\n\n        # Load should return the second save\n        loaded = await manager.load()\n\n        assert loaded is not None\n        assert len(loaded.requests) == 2\n        assert loaded.requests[0].url == \"https://example.com/2\"\n        assert loaded.seen == {\"fp2\", \"fp3\"}\n\n\nclass TestCheckpointManagerEdgeCases:\n    \"\"\"Test edge cases for CheckpointManager.\"\"\"\n\n    @pytest.fixture\n    def temp_dir(self):\n        \"\"\"Create a temporary directory for testing.\"\"\"\n        with tempfile.TemporaryDirectory() as tmpdir:\n            yield Path(tmpdir)\n\n    @pytest.mark.asyncio\n    async def test_save_empty_checkpoint(self, temp_dir: Path):\n        \"\"\"Test saving empty checkpoint data.\"\"\"\n        manager = CheckpointManager(temp_dir / \"crawl\")\n\n        data = CheckpointData(requests=[], seen=set())\n        await manager.save(data)\n\n        loaded = await manager.load()\n\n        assert loaded is not None\n        assert loaded.requests == []\n        assert loaded.seen == set()\n\n    @pytest.mark.asyncio\n    async def test_save_large_checkpoint(self, 
temp_dir: Path):\n        \"\"\"Test saving checkpoint with many requests.\"\"\"\n        manager = CheckpointManager(temp_dir / \"crawl\")\n\n        # Create 1000 requests\n        requests = [\n            Request(f\"https://example.com/{i}\", priority=i % 10)\n            for i in range(1000)\n        ]\n        seen = {f\"fp_{i}\" for i in range(2000)}\n\n        data = CheckpointData(requests=requests, seen=seen)\n        await manager.save(data)\n\n        loaded = await manager.load()\n\n        assert loaded is not None\n        assert len(loaded.requests) == 1000\n        assert len(loaded.seen) == 2000\n\n    @pytest.mark.asyncio\n    async def test_requests_preserve_metadata(self, temp_dir: Path):\n        \"\"\"Test that request metadata is preserved through checkpoint.\"\"\"\n        manager = CheckpointManager(temp_dir / \"crawl\")\n\n        original_request = Request(\n            url=\"https://example.com\",\n            sid=\"my_session\",\n            priority=42,\n            dont_filter=True,\n            meta={\"item_id\": 123, \"page\": 5},\n            proxy=\"http://proxy:8080\",\n        )\n\n        data = CheckpointData(requests=[original_request], seen=set())\n        await manager.save(data)\n\n        loaded = await manager.load()\n\n        assert loaded is not None\n        restored = loaded.requests[0]\n\n        assert restored.url == \"https://example.com\"\n        assert restored.sid == \"my_session\"\n        assert restored.priority == 42\n        assert restored.dont_filter is True\n        assert restored.meta == {\"item_id\": 123, \"page\": 5}\n        assert restored._session_kwargs == {\"proxy\": \"http://proxy:8080\"}\n"
  },
  {
    "path": "tests/spiders/test_engine.py",
    "content": "\"\"\"Tests for the CrawlerEngine class.\"\"\"\n\nimport tempfile\nfrom pathlib import Path\n\nimport anyio\nimport pytest\n\nfrom scrapling.spiders.engine import CrawlerEngine, _dump\nfrom scrapling.spiders.request import Request\nfrom scrapling.spiders.session import SessionManager\nfrom scrapling.spiders.result import CrawlStats, ItemList\nfrom scrapling.spiders.checkpoint import CheckpointData\nfrom scrapling.core._types import Any, Dict, Set, AsyncGenerator\n\n\n# ---------------------------------------------------------------------------\n# Mock helpers\n# ---------------------------------------------------------------------------\n\n\nclass MockResponse:\n    \"\"\"Minimal Response stand-in.\"\"\"\n\n    def __init__(self, status: int = 200, body: bytes = b\"ok\", url: str = \"https://example.com\"):\n        self.status = status\n        self.body = body\n        self.url = url\n        self.request: Any = None\n        self.meta: Dict[str, Any] = {}\n\n    def __str__(self) -> str:\n        return self.url\n\n\nclass MockSession:\n    \"\"\"Mock session that returns a canned response.\"\"\"\n\n    def __init__(self, name: str = \"mock\", response: MockResponse | None = None):\n        self.name = name\n        self._is_alive = False\n        self._response = response or MockResponse()\n        self.fetch_calls: list[dict] = []\n\n    async def __aenter__(self):\n        self._is_alive = True\n        return self\n\n    async def __aexit__(self, *args):\n        self._is_alive = False\n\n    async def fetch(self, url: str, **kwargs):\n        self.fetch_calls.append({\"url\": url, **kwargs})\n        resp = MockResponse(status=self._response.status, body=self._response.body, url=url)\n        return resp\n\n\nclass ErrorSession(MockSession):\n    \"\"\"Session that raises on fetch.\"\"\"\n\n    def __init__(self, error: Exception | None = None):\n        super().__init__(\"error\")\n        self._error = error or RuntimeError(\"fetch failed\")\n\n    async def fetch(self, url: str, **kwargs):\n        raise self._error\n\n\nclass MockSpider:\n    \"\"\"Lightweight spider stub for engine tests.\"\"\"\n\n    def __init__(\n        self,\n        *,\n        concurrent_requests: int = 4,\n        concurrent_requests_per_domain: int = 0,\n        download_delay: float = 0.0,\n        max_blocked_retries: int = 3,\n        allowed_domains: Set[str] | None = None,\n        fp_include_kwargs: bool = False,\n        fp_include_headers: bool = False,\n        fp_keep_fragments: bool = False,\n        is_blocked_fn=None,\n        on_scraped_item_fn=None,\n        retry_blocked_request_fn=None,\n    ):\n        self.concurrent_requests = concurrent_requests\n        self.concurrent_requests_per_domain = concurrent_requests_per_domain\n        self.download_delay = download_delay\n        self.max_blocked_retries = max_blocked_retries\n        self.allowed_domains = allowed_domains or set()\n        self.fp_include_kwargs = fp_include_kwargs\n        self.fp_include_headers = fp_include_headers\n        self.fp_keep_fragments = fp_keep_fragments\n        self.name = \"test_spider\"\n\n        # Tracking lists\n        self.on_start_calls: list[dict] = []\n        self.on_close_calls: int = 0\n        self.on_error_calls: list[tuple[Request, Exception]] = []\n        self.scraped_items: list[dict] = []\n        self.blocked_responses: list = []\n        self.retry_requests: list = []\n\n        # Pluggable behaviour\n        self._is_blocked_fn = is_blocked_fn\n        
self._on_scraped_item_fn = on_scraped_item_fn\n        self._retry_blocked_request_fn = retry_blocked_request_fn\n\n        # Log counter stub\n        self._log_counter = _LogCounterStub()\n\n    async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n        yield {\"url\": str(response)}\n\n    async def on_start(self, resuming: bool = False) -> None:\n        self.on_start_calls.append({\"resuming\": resuming})\n\n    async def on_close(self) -> None:\n        self.on_close_calls += 1\n\n    async def on_error(self, request: Request, error: Exception) -> None:\n        self.on_error_calls.append((request, error))\n\n    async def on_scraped_item(self, item: Dict[str, Any]) -> Dict[str, Any] | None:\n        if self._on_scraped_item_fn:\n            return self._on_scraped_item_fn(item)\n        self.scraped_items.append(item)\n        return item\n\n    async def is_blocked(self, response) -> bool:\n        if self._is_blocked_fn:\n            return self._is_blocked_fn(response)\n        return False\n\n    async def retry_blocked_request(self, request: Request, response) -> Request:\n        self.retry_requests.append(request)\n        if self._retry_blocked_request_fn:\n            return self._retry_blocked_request_fn(request, response)\n        return request\n\n    async def start_requests(self) -> AsyncGenerator[Request, None]:\n        yield Request(\"https://example.com\", sid=\"default\")\n\n\nclass _LogCounterStub:\n    \"\"\"Stub for LogCounterHandler.\"\"\"\n\n    def get_counts(self) -> Dict[str, int]:\n        return {\"debug\": 0, \"info\": 0, \"warning\": 0, \"error\": 0, \"critical\": 0}\n\n\ndef _make_engine(\n    spider: MockSpider | None = None,\n    session: MockSession | None = None,\n    crawldir: str | None = None,\n    interval: float = 300.0,\n) -> CrawlerEngine:\n    \"\"\"Create a CrawlerEngine wired to mock objects.\"\"\"\n    spider = spider or MockSpider()\n    sm = SessionManager()\n    sm.add(\"default\", session or MockSession())\n    return CrawlerEngine(spider, sm, crawldir=crawldir, interval=interval)\n\n\n# ---------------------------------------------------------------------------\n# Tests: _dump helper\n# ---------------------------------------------------------------------------\n\n\nclass TestDumpHelper:\n    def test_dump_returns_json_string(self):\n        result = _dump({\"key\": \"value\"})\n        assert '\"key\": \"value\"' in result\n\n    def test_dump_handles_nested(self):\n        result = _dump({\"a\": {\"b\": 1}})\n        assert '\"a\"' in result\n        assert '\"b\"' in result\n\n\n# ---------------------------------------------------------------------------\n# Tests: __init__\n# ---------------------------------------------------------------------------\n\n\nclass TestCrawlerEngineInit:\n    def test_default_initialisation(self):\n        engine = _make_engine()\n\n        assert engine._running is False\n        assert engine._active_tasks == 0\n        assert engine._pause_requested is False\n        assert engine._force_stop is False\n        assert engine.paused is False\n        assert isinstance(engine.stats, CrawlStats)\n        assert isinstance(engine.items, ItemList)\n\n    def test_checkpoint_system_disabled_by_default(self):\n        engine = _make_engine()\n        assert engine._checkpoint_system_enabled is False\n\n    def test_checkpoint_system_enabled_with_crawldir(self):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            engine = 
_make_engine(crawldir=tmpdir)\n            assert engine._checkpoint_system_enabled is True\n\n    def test_global_limiter_uses_concurrent_requests(self):\n        spider = MockSpider(concurrent_requests=8)\n        engine = _make_engine(spider=spider)\n        assert engine._global_limiter.total_tokens == 8\n\n    def test_allowed_domains_from_spider(self):\n        spider = MockSpider(allowed_domains={\"example.com\", \"test.org\"})\n        engine = _make_engine(spider=spider)\n        assert engine._allowed_domains == {\"example.com\", \"test.org\"}\n\n\n# ---------------------------------------------------------------------------\n# Tests: _is_domain_allowed\n# ---------------------------------------------------------------------------\n\n\nclass TestIsDomainAllowed:\n    def test_all_allowed_when_empty(self):\n        engine = _make_engine()\n        request = Request(\"https://anything.com/page\")\n        assert engine._is_domain_allowed(request) is True\n\n    def test_exact_domain_match(self):\n        spider = MockSpider(allowed_domains={\"example.com\"})\n        engine = _make_engine(spider=spider)\n\n        assert engine._is_domain_allowed(Request(\"https://example.com/page\")) is True\n        assert engine._is_domain_allowed(Request(\"https://other.com/page\")) is False\n\n    def test_subdomain_match(self):\n        spider = MockSpider(allowed_domains={\"example.com\"})\n        engine = _make_engine(spider=spider)\n\n        assert engine._is_domain_allowed(Request(\"https://sub.example.com/page\")) is True\n        assert engine._is_domain_allowed(Request(\"https://deep.sub.example.com/x\")) is True\n\n    def test_partial_name_not_matched(self):\n        spider = MockSpider(allowed_domains={\"example.com\"})\n        engine = _make_engine(spider=spider)\n\n        # \"notexample.com\" should NOT match \"example.com\"\n        assert engine._is_domain_allowed(Request(\"https://notexample.com/x\")) is False\n\n    def test_multiple_allowed_domains(self):\n        spider = MockSpider(allowed_domains={\"a.com\", \"b.org\"})\n        engine = _make_engine(spider=spider)\n\n        assert engine._is_domain_allowed(Request(\"https://a.com/\")) is True\n        assert engine._is_domain_allowed(Request(\"https://b.org/\")) is True\n        assert engine._is_domain_allowed(Request(\"https://c.net/\")) is False\n\n\n# ---------------------------------------------------------------------------\n# Tests: _rate_limiter\n# ---------------------------------------------------------------------------\n\n\nclass TestRateLimiter:\n    def test_returns_global_limiter_when_per_domain_disabled(self):\n        engine = _make_engine()  # concurrent_requests_per_domain=0\n        limiter = engine._rate_limiter(\"example.com\")\n        assert limiter is engine._global_limiter\n\n    def test_returns_per_domain_limiter_when_enabled(self):\n        spider = MockSpider(concurrent_requests_per_domain=2)\n        engine = _make_engine(spider=spider)\n\n        limiter = engine._rate_limiter(\"example.com\")\n        assert limiter is not engine._global_limiter\n        assert limiter.total_tokens == 2\n\n    def test_same_domain_returns_same_limiter(self):\n        spider = MockSpider(concurrent_requests_per_domain=2)\n        engine = _make_engine(spider=spider)\n\n        l1 = engine._rate_limiter(\"example.com\")\n        l2 = engine._rate_limiter(\"example.com\")\n        assert l1 is l2\n\n    def test_different_domains_get_different_limiters(self):\n        spider = 
MockSpider(concurrent_requests_per_domain=2)\n        engine = _make_engine(spider=spider)\n\n        l1 = engine._rate_limiter(\"a.com\")\n        l2 = engine._rate_limiter(\"b.com\")\n        assert l1 is not l2\n\n\n# ---------------------------------------------------------------------------\n# Tests: _normalize_request\n# ---------------------------------------------------------------------------\n\n\nclass TestNormalizeRequest:\n    def test_sets_default_sid_when_empty(self):\n        engine = _make_engine()\n        request = Request(\"https://example.com\")\n        assert request.sid == \"\"\n\n        engine._normalize_request(request)\n        assert request.sid == \"default\"\n\n    def test_preserves_existing_sid(self):\n        engine = _make_engine()\n        request = Request(\"https://example.com\", sid=\"custom\")\n\n        engine._normalize_request(request)\n        assert request.sid == \"custom\"\n\n\n# ---------------------------------------------------------------------------\n# Tests: _process_request\n# ---------------------------------------------------------------------------\n\n\nclass TestProcessRequest:\n    @pytest.mark.asyncio\n    async def test_successful_fetch_updates_stats(self):\n        spider = MockSpider()\n        session = MockSession(response=MockResponse(status=200, body=b\"hello\"))\n        engine = _make_engine(spider=spider, session=session)\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        await engine._process_request(request)\n\n        assert engine.stats.requests_count == 1\n        assert engine.stats.response_bytes == 5  # len(b\"hello\") from MockSession\n        assert \"status_200\" in engine.stats.response_status_count\n\n    @pytest.mark.asyncio\n    async def test_failed_fetch_increments_failed_count(self):\n        spider = MockSpider()\n        sm = SessionManager()\n        sm.add(\"default\", ErrorSession())\n        engine = CrawlerEngine(spider, sm)\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        await engine._process_request(request)\n\n        assert engine.stats.failed_requests_count == 1\n        assert len(spider.on_error_calls) == 1\n\n    @pytest.mark.asyncio\n    async def test_failed_fetch_does_not_increment_requests_count(self):\n        spider = MockSpider()\n        sm = SessionManager()\n        sm.add(\"default\", ErrorSession())\n        engine = CrawlerEngine(spider, sm)\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        await engine._process_request(request)\n\n        assert engine.stats.requests_count == 0\n\n    @pytest.mark.asyncio\n    async def test_blocked_response_triggers_retry(self):\n        spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=2)\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        await engine._process_request(request)\n\n        assert engine.stats.blocked_requests_count == 1\n        # A retry request should be enqueued\n        assert not engine.scheduler.is_empty\n\n    @pytest.mark.asyncio\n    async def test_blocked_response_max_retries_exceeded(self):\n        spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=2)\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        request._retry_count = 2  # Already at max\n        await engine._process_request(request)\n\n        assert 
engine.stats.blocked_requests_count == 1\n        # No retry enqueued\n        assert engine.scheduler.is_empty\n\n    @pytest.mark.asyncio\n    async def test_retry_request_has_dont_filter(self):\n        spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=3)\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        await engine._process_request(request)\n\n        retry = await engine.scheduler.dequeue()\n        assert retry.dont_filter is True\n        assert retry._retry_count == 1\n\n    @pytest.mark.asyncio\n    async def test_retry_clears_proxy_kwargs(self):\n        spider = MockSpider(is_blocked_fn=lambda r: True, max_blocked_retries=3)\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\", proxy=\"http://proxy:8080\")\n        await engine._process_request(request)\n\n        retry = await engine.scheduler.dequeue()\n        assert \"proxy\" not in retry._session_kwargs\n        assert \"proxies\" not in retry._session_kwargs\n\n    @pytest.mark.asyncio\n    async def test_callback_yielding_dict_increments_items(self):\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        await engine._process_request(request)\n\n        assert engine.stats.items_scraped == 1\n        assert len(engine.items) == 1\n\n    @pytest.mark.asyncio\n    async def test_callback_yielding_request_enqueues(self):\n        async def callback(response) -> AsyncGenerator:\n            yield Request(\"https://example.com/page2\", sid=\"default\")\n\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\", callback=callback)\n        await engine._process_request(request)\n\n        assert not engine.scheduler.is_empty\n\n    @pytest.mark.asyncio\n    async def test_callback_yielding_offsite_request_filtered(self):\n        async def callback(response) -> AsyncGenerator:\n            yield Request(\"https://other.com/page\", sid=\"default\")\n\n        spider = MockSpider(allowed_domains={\"example.com\"})\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\", callback=callback)\n        await engine._process_request(request)\n\n        assert engine.stats.offsite_requests_count == 1\n        assert engine.scheduler.is_empty\n\n    @pytest.mark.asyncio\n    async def test_dropped_item_when_on_scraped_item_returns_none(self):\n        spider = MockSpider(on_scraped_item_fn=lambda item: None)\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        await engine._process_request(request)\n\n        assert engine.stats.items_dropped == 1\n        assert engine.stats.items_scraped == 0\n        assert len(engine.items) == 0\n\n    @pytest.mark.asyncio\n    async def test_callback_exception_calls_on_error(self):\n        async def bad_callback(response) -> AsyncGenerator:\n            raise ValueError(\"callback boom\")\n            yield  # noqa: unreachable\n\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\", callback=bad_callback)\n        await engine._process_request(request)\n\n        assert len(spider.on_error_calls) == 1\n        assert 
isinstance(spider.on_error_calls[0][1], ValueError)\n\n    @pytest.mark.asyncio\n    async def test_proxy_tracked_in_stats(self):\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\", proxy=\"http://p:8080\")\n        await engine._process_request(request)\n\n        assert \"http://p:8080\" in engine.stats.proxies\n\n    @pytest.mark.asyncio\n    async def test_proxies_dict_tracked_in_stats(self):\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        proxies = {\"http\": \"http://p:8080\", \"https\": \"https://p:8443\"}\n        request = Request(\"https://example.com\", sid=\"default\", proxies=proxies)\n        await engine._process_request(request)\n\n        assert len(engine.stats.proxies) == 1\n        assert engine.stats.proxies[0] == proxies\n\n    @pytest.mark.asyncio\n    async def test_uses_parse_when_no_callback(self):\n        items_seen = []\n\n        async def custom_parse(response) -> AsyncGenerator:\n            yield {\"from\": \"custom_parse\"}\n\n        spider = MockSpider()\n        spider.parse = custom_parse  # type: ignore[assignment]\n        engine = _make_engine(spider=spider)\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        # No callback set → should use spider.parse\n        await engine._process_request(request)\n\n        assert engine.stats.items_scraped == 1\n\n\n# ---------------------------------------------------------------------------\n# Tests: _task_wrapper\n# ---------------------------------------------------------------------------\n\n\nclass TestTaskWrapper:\n    @pytest.mark.asyncio\n    async def test_decrements_active_tasks(self):\n        engine = _make_engine()\n        engine._active_tasks = 1\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        await engine._task_wrapper(request)\n\n        assert engine._active_tasks == 0\n\n    @pytest.mark.asyncio\n    async def test_decrements_even_on_error(self):\n        spider = MockSpider()\n        sm = SessionManager()\n        sm.add(\"default\", ErrorSession())\n        engine = CrawlerEngine(spider, sm)\n        engine._active_tasks = 1\n\n        request = Request(\"https://example.com\", sid=\"default\")\n        await engine._task_wrapper(request)\n\n        assert engine._active_tasks == 0\n\n\n# ---------------------------------------------------------------------------\n# Tests: request_pause\n# ---------------------------------------------------------------------------\n\n\nclass TestRequestPause:\n    def test_first_call_sets_pause_requested(self):\n        engine = _make_engine()\n\n        engine.request_pause()\n\n        assert engine._pause_requested is True\n        assert engine._force_stop is False\n\n    def test_second_call_sets_force_stop(self):\n        engine = _make_engine()\n\n        engine.request_pause()  # first\n        engine.request_pause()  # second\n\n        assert engine._pause_requested is True\n        assert engine._force_stop is True\n\n    def test_third_call_after_force_stop_is_noop(self):\n        engine = _make_engine()\n\n        engine.request_pause()\n        engine.request_pause()\n        engine.request_pause()  # should not raise\n\n        assert engine._force_stop is True\n\n\n# ---------------------------------------------------------------------------\n# Tests: checkpoint methods\n# 
---------------------------------------------------------------------------\n\n\nclass TestCheckpointMethods:\n    def test_is_checkpoint_time_false_when_disabled(self):\n        engine = _make_engine()  # no crawldir\n        assert engine._is_checkpoint_time() is False\n\n    @pytest.mark.asyncio\n    async def test_save_and_restore_checkpoint(self):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            spider = MockSpider()\n            engine = _make_engine(spider=spider, crawldir=tmpdir)\n\n            # Enqueue a request so snapshot has data\n            req = Request(\"https://example.com\", sid=\"default\")\n            engine._normalize_request(req)\n            await engine.scheduler.enqueue(req)\n\n            await engine._save_checkpoint()\n\n            # Verify checkpoint file exists\n            checkpoint_path = Path(tmpdir) / \"checkpoint.pkl\"\n            assert checkpoint_path.exists()\n\n    @pytest.mark.asyncio\n    async def test_restore_when_no_checkpoint_returns_false(self):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            engine = _make_engine(crawldir=tmpdir)\n            result = await engine._restore_from_checkpoint()\n            assert result is False\n\n    @pytest.mark.asyncio\n    async def test_restore_from_checkpoint_raises_when_disabled(self):\n        engine = _make_engine()  # no crawldir → checkpoint disabled\n        with pytest.raises(RuntimeError):\n            await engine._restore_from_checkpoint()\n\n\n# ---------------------------------------------------------------------------\n# Tests: crawl\n# ---------------------------------------------------------------------------\n\n\nclass TestCrawl:\n    @pytest.mark.asyncio\n    async def test_basic_crawl_returns_stats(self):\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        stats = await engine.crawl()\n\n        assert isinstance(stats, CrawlStats)\n        assert stats.requests_count >= 1\n        assert stats.items_scraped >= 1\n\n    @pytest.mark.asyncio\n    async def test_crawl_calls_on_start_and_on_close(self):\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        await engine.crawl()\n\n        assert len(spider.on_start_calls) == 1\n        assert spider.on_start_calls[0][\"resuming\"] is False\n        assert spider.on_close_calls == 1\n\n    @pytest.mark.asyncio\n    async def test_crawl_sets_stats_timing(self):\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        stats = await engine.crawl()\n\n        assert stats.start_time > 0\n        assert stats.end_time > 0\n        assert stats.end_time >= stats.start_time\n\n    @pytest.mark.asyncio\n    async def test_crawl_sets_concurrency_stats(self):\n        spider = MockSpider(concurrent_requests=16, concurrent_requests_per_domain=4)\n        engine = _make_engine(spider=spider)\n\n        stats = await engine.crawl()\n\n        assert stats.concurrent_requests == 16\n        assert stats.concurrent_requests_per_domain == 4\n\n    @pytest.mark.asyncio\n    async def test_crawl_processes_multiple_start_urls(self):\n        spider = MockSpider()\n\n        urls = [\"https://example.com/1\", \"https://example.com/2\", \"https://example.com/3\"]\n\n        async def multi_start_requests() -> AsyncGenerator[Request, None]:\n            for url in urls:\n                yield Request(url, sid=\"default\")\n\n        spider.start_requests = multi_start_requests  # type: ignore[assignment]\n        engine = 
_make_engine(spider=spider)\n\n        stats = await engine.crawl()\n\n        assert stats.requests_count == 3\n        assert stats.items_scraped == 3\n\n    @pytest.mark.asyncio\n    async def test_crawl_follows_yielded_requests(self):\n        \"\"\"Test that requests yielded from callbacks are processed.\"\"\"\n        call_count = 0\n\n        async def parse_with_follow(response) -> AsyncGenerator:\n            nonlocal call_count\n            call_count += 1\n            if call_count == 1:\n                yield Request(\"https://example.com/page2\", sid=\"default\")\n            yield {\"page\": str(response)}\n\n        spider = MockSpider()\n        spider.parse = parse_with_follow  # type: ignore[assignment]\n        engine = _make_engine(spider=spider)\n\n        stats = await engine.crawl()\n\n        assert stats.requests_count == 2\n        assert stats.items_scraped == 2\n\n    @pytest.mark.asyncio\n    async def test_crawl_with_download_delay(self):\n        spider = MockSpider(download_delay=0.01)\n        engine = _make_engine(spider=spider)\n\n        stats = await engine.crawl()\n\n        assert stats.download_delay == 0.01\n        assert stats.requests_count >= 1\n\n    @pytest.mark.asyncio\n    async def test_crawl_filters_offsite_requests(self):\n        async def parse_offsite(response) -> AsyncGenerator:\n            yield Request(\"https://other-domain.com/page\", sid=\"default\")\n            yield {\"url\": str(response)}\n\n        spider = MockSpider(allowed_domains={\"example.com\"})\n        spider.parse = parse_offsite  # type: ignore[assignment]\n        engine = _make_engine(spider=spider)\n\n        stats = await engine.crawl()\n\n        assert stats.offsite_requests_count == 1\n        assert stats.requests_count == 1  # Only the initial request\n\n    @pytest.mark.asyncio\n    async def test_crawl_cleans_up_checkpoint_on_completion(self):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            spider = MockSpider()\n            engine = _make_engine(spider=spider, crawldir=tmpdir)\n\n            await engine.crawl()\n\n            checkpoint_path = Path(tmpdir) / \"checkpoint.pkl\"\n            assert not checkpoint_path.exists()  # Cleaned up\n\n    @pytest.mark.asyncio\n    async def test_crawl_handles_fetch_error_gracefully(self):\n        spider = MockSpider()\n        sm = SessionManager()\n        sm.add(\"default\", ErrorSession())\n        engine = CrawlerEngine(spider, sm)\n\n        stats = await engine.crawl()\n\n        assert stats.failed_requests_count == 1\n        assert len(spider.on_error_calls) == 1\n\n    @pytest.mark.asyncio\n    async def test_crawl_log_levels_populated(self):\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        stats = await engine.crawl()\n\n        assert isinstance(stats.log_levels_counter, dict)\n\n    @pytest.mark.asyncio\n    async def test_crawl_resets_state_on_each_run(self):\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        # Run first crawl\n        await engine.crawl()\n        assert engine.stats.requests_count >= 1\n\n        # Run second crawl - stats should reset\n        stats = await engine.crawl()\n        # Items are cleared on each crawl\n        assert engine.paused is False\n\n\n# ---------------------------------------------------------------------------\n# Tests: items property\n# ---------------------------------------------------------------------------\n\n\nclass TestItemsProperty:\n    def 
test_items_returns_item_list(self):\n        engine = _make_engine()\n        assert isinstance(engine.items, ItemList)\n\n    def test_items_initially_empty(self):\n        engine = _make_engine()\n        assert len(engine.items) == 0\n\n    @pytest.mark.asyncio\n    async def test_items_populated_after_crawl(self):\n        engine = _make_engine()\n        await engine.crawl()\n        assert len(engine.items) >= 1\n\n\n# ---------------------------------------------------------------------------\n# Tests: streaming (__aiter__ / _stream)\n# ---------------------------------------------------------------------------\n\n\nclass TestStreaming:\n    @pytest.mark.asyncio\n    async def test_stream_yields_items(self):\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        items = []\n        async for item in engine:\n            items.append(item)\n\n        assert len(items) >= 1\n        assert isinstance(items[0], dict)\n\n    @pytest.mark.asyncio\n    async def test_stream_processes_follow_up_requests(self):\n        call_count = 0\n\n        async def parse_with_follow(response) -> AsyncGenerator:\n            nonlocal call_count\n            call_count += 1\n            if call_count == 1:\n                yield Request(\"https://example.com/page2\", sid=\"default\")\n            yield {\"page\": call_count}\n\n        spider = MockSpider()\n        spider.parse = parse_with_follow  # type: ignore[assignment]\n        engine = _make_engine(spider=spider)\n\n        items = []\n        async for item in engine:\n            items.append(item)\n\n        assert len(items) == 2\n\n    @pytest.mark.asyncio\n    async def test_stream_items_not_stored_in_items_list(self):\n        \"\"\"When streaming, items go to the stream, not to engine._items.\"\"\"\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        items = []\n        async for item in engine:\n            items.append(item)\n\n        # Items were sent through stream, not appended to _items\n        assert len(items) >= 1\n        assert len(engine.items) == 0\n\n\n# ---------------------------------------------------------------------------\n# Tests: pause during crawl\n# ---------------------------------------------------------------------------\n\n\nclass TestPauseDuringCrawl:\n    @pytest.mark.asyncio\n    async def test_pause_stops_crawl_gracefully(self):\n        processed = 0\n\n        async def slow_parse(response) -> AsyncGenerator:\n            nonlocal processed\n            processed += 1\n            # Yield more requests to keep the crawl going\n            if processed <= 2:\n                yield Request(f\"https://example.com/p{processed + 1}\", sid=\"default\")\n            yield {\"n\": processed}\n\n        spider = MockSpider()\n        spider.parse = slow_parse  # type: ignore[assignment]\n        engine = _make_engine(spider=spider)\n\n        # Request pause immediately - the engine will stop as soon as active tasks complete\n        engine._pause_requested = True\n\n        stats = await engine.crawl()\n        # Should stop without processing everything\n        assert engine._running is False\n\n    @pytest.mark.asyncio\n    async def test_pause_with_checkpoint_sets_paused(self):\n        with tempfile.TemporaryDirectory() as tmpdir:\n            parse_count = 0\n\n            async def parse_and_pause(response) -> AsyncGenerator:\n                nonlocal parse_count\n                parse_count += 1\n                # Request pause after 
first request, but yield follow-ups\n                if parse_count == 1:\n                    engine.request_pause()\n                    yield Request(\"https://example.com/p2\", sid=\"default\")\n                yield {\"n\": parse_count}\n\n            spider = MockSpider()\n            spider.parse = parse_and_pause  # type: ignore[assignment]\n            engine = _make_engine(spider=spider, crawldir=tmpdir)\n\n            await engine.crawl()\n\n            assert engine.paused is True\n\n    @pytest.mark.asyncio\n    async def test_pause_without_checkpoint_does_not_set_paused(self):\n        spider = MockSpider()\n        engine = _make_engine(spider=spider)\n\n        engine._pause_requested = True\n\n        await engine.crawl()\n\n        assert engine.paused is False\n"
  },
  {
    "path": "tests/spiders/test_request.py",
    "content": "\"\"\"Tests for the Request class.\"\"\"\n\nimport pickle\n\nimport pytest\n\nfrom scrapling.spiders.request import Request\nfrom scrapling.core._types import Any, Dict, AsyncGenerator\n\n\nclass TestRequestCreation:\n    \"\"\"Test Request initialization and basic attributes.\"\"\"\n\n    def test_basic_request_creation(self):\n        \"\"\"Test creating a request with just a URL.\"\"\"\n        request = Request(\"https://example.com\")\n\n        assert request.url == \"https://example.com\"\n        assert request.sid == \"\"\n        assert request.callback is None\n        assert request.priority == 0\n        assert request.dont_filter is False\n        assert request.meta == {}\n        assert request._retry_count == 0\n        assert request._session_kwargs == {}\n\n    def test_request_with_all_parameters(self):\n        \"\"\"Test creating a request with all parameters.\"\"\"\n\n        async def my_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n            yield {\"test\": \"data\"}\n\n        request = Request(\n            url=\"https://example.com/page\",\n            sid=\"my_session\",\n            callback=my_callback,\n            priority=10,\n            dont_filter=True,\n            meta={\"key\": \"value\"},\n            _retry_count=2,\n            proxy=\"http://proxy:8080\",\n            timeout=30,\n        )\n\n        assert request.url == \"https://example.com/page\"\n        assert request.sid == \"my_session\"\n        assert request.callback == my_callback\n        assert request.priority == 10\n        assert request.dont_filter is True\n        assert request.meta == {\"key\": \"value\"}\n        assert request._retry_count == 2\n        assert request._session_kwargs == {\"proxy\": \"http://proxy:8080\", \"timeout\": 30}\n\n    def test_request_meta_default_is_empty_dict(self):\n        \"\"\"Test that meta defaults to empty dict, not shared reference.\"\"\"\n        r1 = Request(\"https://example.com\")\n        r2 = Request(\"https://example.com\")\n\n        r1.meta[\"key\"] = \"value\"\n\n        assert r1.meta == {\"key\": \"value\"}\n        assert r2.meta == {}\n\n\nclass TestRequestProperties:\n    \"\"\"Test Request computed properties.\"\"\"\n\n    def test_domain_extraction(self):\n        \"\"\"Test domain property extracts netloc correctly.\"\"\"\n        request = Request(\"https://www.example.com/path/page.html?query=1\")\n        assert request.domain == \"www.example.com\"\n\n    def test_domain_with_port(self):\n        \"\"\"Test domain extraction with port number.\"\"\"\n        request = Request(\"http://localhost:8080/api\")\n        assert request.domain == \"localhost:8080\"\n\n    def test_domain_with_subdomain(self):\n        \"\"\"Test domain extraction with subdomains.\"\"\"\n        request = Request(\"https://api.v2.example.com/endpoint\")\n        assert request.domain == \"api.v2.example.com\"\n\n    def test_fingerprint_returns_bytes(self):\n        \"\"\"Test fingerprint generation returns bytes.\"\"\"\n        request = Request(\"https://example.com\")\n        fp = request.update_fingerprint()\n        assert isinstance(fp, bytes)\n        assert len(fp) == 20  # SHA1 produces 20 bytes\n\n    def test_fingerprint_is_deterministic(self):\n        \"\"\"Test same request produces same fingerprint.\"\"\"\n        r1 = Request(\"https://example.com\", data={\"key\": \"value\"})\n        r2 = Request(\"https://example.com\", data={\"key\": \"value\"})\n        assert 
r1.update_fingerprint() == r2.update_fingerprint()\n\n    def test_fingerprint_different_urls(self):\n        \"\"\"Test different URLs produce different fingerprints.\"\"\"\n        r1 = Request(\"https://example.com/page1\")\n        r2 = Request(\"https://example.com/page2\")\n        assert r1.update_fingerprint() != r2.update_fingerprint()\n\n\nclass TestRequestCopy:\n    \"\"\"Test Request copy functionality.\"\"\"\n\n    def test_copy_creates_independent_request(self):\n        \"\"\"Test that copy creates a new independent request.\"\"\"\n\n        async def callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n            yield None\n\n        original = Request(\n            url=\"https://example.com\",\n            sid=\"session\",\n            callback=callback,\n            priority=5,\n            dont_filter=True,\n            meta={\"original\": True},\n            _retry_count=1,\n            proxy=\"http://proxy:8080\",\n        )\n\n        copied = original.copy()\n\n        # Check all values are copied\n        assert copied.url == original.url\n        assert copied.sid == original.sid\n        assert copied.callback == original.callback\n        assert copied.priority == original.priority\n        assert copied.dont_filter == original.dont_filter\n        assert copied.meta == original.meta\n        assert copied._retry_count == original._retry_count\n        assert copied._session_kwargs == original._session_kwargs\n\n        # Check they are different objects\n        assert copied is not original\n        assert copied.meta is not original.meta  # Meta should be a copy\n\n    def test_copy_meta_is_independent(self):\n        \"\"\"Test that modifying copied meta doesn't affect original.\"\"\"\n        original = Request(\"https://example.com\", meta={\"key\": \"original\"})\n        copied = original.copy()\n\n        copied.meta[\"key\"] = \"modified\"\n        copied.meta[\"new_key\"] = \"new_value\"\n\n        assert original.meta == {\"key\": \"original\"}\n        assert copied.meta == {\"key\": \"modified\", \"new_key\": \"new_value\"}\n\n\nclass TestRequestComparison:\n    \"\"\"Test Request comparison operators.\"\"\"\n\n    def test_priority_less_than(self):\n        \"\"\"Test less than comparison by priority.\"\"\"\n        low_priority = Request(\"https://example.com/1\", priority=1)\n        high_priority = Request(\"https://example.com/2\", priority=10)\n\n        assert low_priority < high_priority\n        assert not high_priority < low_priority\n\n    def test_priority_greater_than(self):\n        \"\"\"Test greater than comparison by priority.\"\"\"\n        low_priority = Request(\"https://example.com/1\", priority=1)\n        high_priority = Request(\"https://example.com/2\", priority=10)\n\n        assert high_priority > low_priority\n        assert not low_priority > high_priority\n\n    def test_equality_by_fingerprint(self):\n        \"\"\"Test equality comparison by fingerprint.\"\"\"\n        r1 = Request(\"https://example.com\")\n        r2 = Request(\"https://example.com\")\n        r3 = Request(\"https://example.com/other\")\n\n        # Generate fingerprints first (required for equality)\n        r1.update_fingerprint()\n        r2.update_fingerprint()\n        r3.update_fingerprint()\n\n        assert r1 == r2\n        assert r1 != r3\n\n    def test_equality_different_priorities_same_fingerprint(self):\n        \"\"\"Test requests with same fingerprint are equal despite different priorities.\"\"\"\n        
r1 = Request(\"https://example.com\", priority=1)\n        r2 = Request(\"https://example.com\", priority=100)\n\n        # Generate fingerprints first\n        r1.update_fingerprint()\n        r2.update_fingerprint()\n\n        assert r1 == r2  # Same fingerprint means equal\n\n    def test_comparison_with_non_request(self):\n        \"\"\"Test comparison with non-Request types returns NotImplemented.\"\"\"\n        request = Request(\"https://example.com\")\n\n        assert request.__lt__(\"not a request\") == NotImplemented\n        assert request.__gt__(\"not a request\") == NotImplemented\n        assert request.__eq__(\"not a request\") == NotImplemented\n\n\nclass TestRequestStringRepresentation:\n    \"\"\"Test Request string representations.\"\"\"\n\n    def test_str_returns_url(self):\n        \"\"\"Test __str__ returns the URL.\"\"\"\n        request = Request(\"https://example.com/page\")\n        assert str(request) == \"https://example.com/page\"\n\n    def test_repr_without_callback(self):\n        \"\"\"Test __repr__ without callback.\"\"\"\n        request = Request(\"https://example.com\", priority=5)\n        repr_str = repr(request)\n\n        assert \"Request\" in repr_str\n        assert \"https://example.com\" in repr_str\n        assert \"priority=5\" in repr_str\n        assert \"callback=None\" in repr_str\n\n    def test_repr_with_callback(self):\n        \"\"\"Test __repr__ with named callback.\"\"\"\n\n        async def my_custom_callback(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n            yield None\n\n        request = Request(\"https://example.com\", callback=my_custom_callback)\n        repr_str = repr(request)\n\n        assert \"callback=my_custom_callback\" in repr_str\n\n\nclass TestRequestPickling:\n    \"\"\"Test Request serialization for checkpointing.\"\"\"\n\n    def test_pickle_without_callback(self):\n        \"\"\"Test pickling request without callback.\"\"\"\n        original = Request(\n            url=\"https://example.com\",\n            sid=\"session\",\n            priority=5,\n            meta={\"key\": \"value\"},\n        )\n\n        pickled = pickle.dumps(original)\n        restored = pickle.loads(pickled)\n\n        assert restored.url == original.url\n        assert restored.sid == original.sid\n        assert restored.priority == original.priority\n        assert restored.meta == original.meta\n        assert restored.callback is None\n\n    def test_pickle_with_callback_stores_name(self):\n        \"\"\"Test that callback name is stored when pickling.\"\"\"\n\n        async def parse_page(response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n            yield {\"data\": \"test\"}\n\n        original = Request(\"https://example.com\", callback=parse_page)\n\n        # Check getstate stores callback name\n        state = original.__getstate__()\n        assert state[\"_callback_name\"] == \"parse_page\"\n        assert state[\"callback\"] is None\n\n    def test_pickle_with_none_callback(self):\n        \"\"\"Test pickling with None callback.\"\"\"\n        original = Request(\"https://example.com\", callback=None)\n\n        state = original.__getstate__()\n        assert state[\"_callback_name\"] is None\n        assert state[\"callback\"] is None\n\n    def test_setstate_stores_callback_name(self):\n        \"\"\"Test that setstate correctly handles callback name.\"\"\"\n        request = Request(\"https://example.com\")\n        state = {\n            \"url\": 
\"https://example.com\",\n            \"sid\": \"\",\n            \"callback\": None,\n            \"priority\": 0,\n            \"dont_filter\": False,\n            \"meta\": {},\n            \"_retry_count\": 0,\n            \"_session_kwargs\": {},\n            \"_callback_name\": \"custom_parse\",\n        }\n\n        request.__setstate__(state)\n\n        assert hasattr(request, \"_callback_name\")\n        assert request._callback_name == \"custom_parse\"\n\n    def test_pickle_roundtrip_preserves_session_kwargs(self):\n        \"\"\"Test that session kwargs are preserved through pickle.\"\"\"\n        original = Request(\n            \"https://example.com\",\n            proxy=\"http://proxy:8080\",\n            timeout=30,\n            headers={\"User-Agent\": \"test\"},\n        )\n\n        pickled = pickle.dumps(original)\n        restored = pickle.loads(pickled)\n\n        assert restored._session_kwargs == {\n            \"proxy\": \"http://proxy:8080\",\n            \"timeout\": 30,\n            \"headers\": {\"User-Agent\": \"test\"},\n        }\n\n\nclass TestRequestRestoreCallback:\n    \"\"\"Test callback restoration from spider.\"\"\"\n\n    def test_restore_callback_from_spider(self):\n        \"\"\"Test restoring callback from spider instance.\"\"\"\n\n        class MockSpider:\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n            async def parse_detail(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield {\"detail\": True}\n\n        spider = MockSpider()\n        request = Request(\"https://example.com\")\n        request._callback_name = \"parse_detail\"\n\n        request._restore_callback(spider)  # type: ignore[arg-type]\n\n        assert request.callback == spider.parse_detail\n        assert not hasattr(request, \"_callback_name\")\n\n    def test_restore_callback_falls_back_to_parse(self):\n        \"\"\"Test that missing callback falls back to spider.parse.\"\"\"\n\n        class MockSpider:\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        spider = MockSpider()\n        request = Request(\"https://example.com\")\n        request._callback_name = \"nonexistent_method\"\n\n        request._restore_callback(spider)  # type: ignore[arg-type]\n\n        assert request.callback == spider.parse\n        assert not hasattr(request, \"_callback_name\")\n\n    def test_restore_callback_with_none_name(self):\n        \"\"\"Test restore callback when _callback_name is None.\"\"\"\n\n        class MockSpider:\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        spider = MockSpider()\n        request = Request(\"https://example.com\")\n        request._callback_name = None\n\n        request._restore_callback(spider)  # type: ignore[arg-type]\n\n        # Should clean up _callback_name attribute\n        assert not hasattr(request, \"_callback_name\")\n\n    def test_restore_callback_without_callback_name_attr(self):\n        \"\"\"Test restore callback when _callback_name attribute doesn't exist.\"\"\"\n\n        class MockSpider:\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        spider = MockSpider()\n        request = Request(\"https://example.com\")\n        # Don't 
set _callback_name\n\n        # Should not raise an error\n        request._restore_callback(spider)  # type: ignore[arg-type]\n"
  },
  {
    "path": "tests/spiders/test_result.py",
    "content": "\"\"\"Tests for the result module (ItemList, CrawlStats, CrawlResult).\"\"\"\n\nimport json\nimport tempfile\nfrom pathlib import Path\n\nimport pytest\n\nfrom scrapling.spiders.result import ItemList, CrawlStats, CrawlResult\n\n\nclass TestItemList:\n    \"\"\"Test ItemList functionality.\"\"\"\n\n    def test_itemlist_is_list(self):\n        \"\"\"Test that ItemList is a list subclass.\"\"\"\n        items = ItemList()\n\n        assert isinstance(items, list)\n\n    def test_itemlist_basic_operations(self):\n        \"\"\"Test basic list operations work.\"\"\"\n        items = ItemList()\n\n        items.append({\"id\": 1})\n        items.append({\"id\": 2})\n\n        assert len(items) == 2\n        assert items[0] == {\"id\": 1}\n\n    def test_to_json_creates_file(self):\n        \"\"\"Test to_json creates JSON file.\"\"\"\n        items = ItemList()\n        items.append({\"name\": \"test\", \"value\": 123})\n        items.append({\"name\": \"test2\", \"value\": 456})\n\n        with tempfile.TemporaryDirectory() as tmpdir:\n            path = Path(tmpdir) / \"output.json\"\n            items.to_json(path)\n\n            assert path.exists()\n\n            content = json.loads(path.read_text())\n            assert len(content) == 2\n            assert content[0][\"name\"] == \"test\"\n\n    def test_to_json_creates_parent_directory(self):\n        \"\"\"Test to_json creates parent directories.\"\"\"\n        items = ItemList()\n        items.append({\"data\": \"test\"})\n\n        with tempfile.TemporaryDirectory() as tmpdir:\n            path = Path(tmpdir) / \"nested\" / \"dirs\" / \"output.json\"\n            items.to_json(path)\n\n            assert path.exists()\n\n    def test_to_json_with_indent(self):\n        \"\"\"Test to_json with indentation.\"\"\"\n        items = ItemList()\n        items.append({\"key\": \"value\"})\n\n        with tempfile.TemporaryDirectory() as tmpdir:\n            path = Path(tmpdir) / \"output.json\"\n            items.to_json(path, indent=True)\n\n            content = path.read_text()\n            # Indented JSON should have newlines\n            assert \"\\n\" in content\n\n    def test_to_jsonl_creates_file(self):\n        \"\"\"Test to_jsonl creates JSON Lines file.\"\"\"\n        items = ItemList()\n        items.append({\"id\": 1, \"name\": \"first\"})\n        items.append({\"id\": 2, \"name\": \"second\"})\n        items.append({\"id\": 3, \"name\": \"third\"})\n\n        with tempfile.TemporaryDirectory() as tmpdir:\n            path = Path(tmpdir) / \"output.jsonl\"\n            items.to_jsonl(path)\n\n            assert path.exists()\n\n            lines = path.read_text().strip().split(\"\\n\")\n            assert len(lines) == 3\n\n            # Each line should be valid JSON\n            for line in lines:\n                parsed = json.loads(line)\n                assert \"id\" in parsed\n                assert \"name\" in parsed\n\n    def test_to_jsonl_one_object_per_line(self):\n        \"\"\"Test that JSONL has one JSON object per line.\"\"\"\n        items = ItemList()\n        items.append({\"line\": 1})\n        items.append({\"line\": 2})\n\n        with tempfile.TemporaryDirectory() as tmpdir:\n            path = Path(tmpdir) / \"output.jsonl\"\n            items.to_jsonl(path)\n\n            lines = path.read_text().strip().split(\"\\n\")\n\n            assert json.loads(lines[0])[\"line\"] == 1\n            assert json.loads(lines[1])[\"line\"] == 2\n\n\nclass TestCrawlStats:\n    \"\"\"Test CrawlStats 
dataclass.\"\"\"\n\n    def test_default_values(self):\n        \"\"\"Test CrawlStats default values.\"\"\"\n        stats = CrawlStats()\n\n        assert stats.requests_count == 0\n        assert stats.concurrent_requests == 0\n        assert stats.failed_requests_count == 0\n        assert stats.response_bytes == 0\n        assert stats.items_scraped == 0\n        assert stats.items_dropped == 0\n        assert stats.start_time == 0.0\n        assert stats.end_time == 0.0\n        assert stats.custom_stats == {}\n        assert stats.response_status_count == {}\n        assert stats.proxies == []\n\n    def test_elapsed_seconds(self):\n        \"\"\"Test elapsed_seconds property.\"\"\"\n        stats = CrawlStats(start_time=100.0, end_time=150.0)\n\n        assert stats.elapsed_seconds == 50.0\n\n    def test_requests_per_second(self):\n        \"\"\"Test requests_per_second calculation.\"\"\"\n        stats = CrawlStats(\n            requests_count=100,\n            start_time=0.0,\n            end_time=10.0,\n        )\n\n        assert stats.requests_per_second == 10.0\n\n    def test_requests_per_second_zero_elapsed(self):\n        \"\"\"Test requests_per_second when elapsed is zero.\"\"\"\n        stats = CrawlStats(\n            requests_count=100,\n            start_time=0.0,\n            end_time=0.0,\n        )\n\n        assert stats.requests_per_second == 0.0\n\n    def test_increment_status(self):\n        \"\"\"Test increment_status method.\"\"\"\n        stats = CrawlStats()\n\n        stats.increment_status(200)\n        stats.increment_status(200)\n        stats.increment_status(404)\n\n        assert stats.response_status_count == {\"status_200\": 2, \"status_404\": 1}\n\n    def test_increment_response_bytes(self):\n        \"\"\"Test increment_response_bytes method.\"\"\"\n        stats = CrawlStats()\n\n        stats.increment_response_bytes(\"example.com\", 1000)\n        stats.increment_response_bytes(\"example.com\", 500)\n        stats.increment_response_bytes(\"other.com\", 2000)\n\n        assert stats.response_bytes == 3500\n        assert stats.domains_response_bytes == {\n            \"example.com\": 1500,\n            \"other.com\": 2000,\n        }\n\n    def test_increment_requests_count(self):\n        \"\"\"Test increment_requests_count method.\"\"\"\n        stats = CrawlStats()\n\n        stats.increment_requests_count(\"session1\")\n        stats.increment_requests_count(\"session1\")\n        stats.increment_requests_count(\"session2\")\n\n        assert stats.requests_count == 3\n        assert stats.sessions_requests_count == {\"session1\": 2, \"session2\": 1}\n\n    def test_to_dict(self):\n        \"\"\"Test to_dict method returns all stats.\"\"\"\n        stats = CrawlStats(\n            items_scraped=10,\n            items_dropped=2,\n            requests_count=15,\n            start_time=0.0,\n            end_time=5.0,\n        )\n        stats.increment_status(200)\n\n        result = stats.to_dict()\n\n        assert result[\"items_scraped\"] == 10\n        assert result[\"items_dropped\"] == 2\n        assert result[\"requests_count\"] == 15\n        assert result[\"elapsed_seconds\"] == 5.0\n        assert result[\"requests_per_second\"] == 3.0\n        assert result[\"response_status_count\"] == {\"status_200\": 1}\n\n    def test_custom_stats(self):\n        \"\"\"Test custom_stats can be used.\"\"\"\n        stats = CrawlStats()\n        stats.custom_stats[\"my_metric\"] = 42\n        stats.custom_stats[\"another\"] = \"value\"\n\n     
   assert stats.custom_stats[\"my_metric\"] == 42\n        assert stats.to_dict()[\"custom_stats\"][\"my_metric\"] == 42\n\n\nclass TestCrawlResult:\n    \"\"\"Test CrawlResult dataclass.\"\"\"\n\n    def test_basic_creation(self):\n        \"\"\"Test basic CrawlResult creation.\"\"\"\n        stats = CrawlStats(items_scraped=5)\n        items = ItemList()\n        items.extend([{\"id\": i} for i in range(5)])\n\n        result = CrawlResult(stats=stats, items=items)\n\n        assert result.stats.items_scraped == 5\n        assert len(result.items) == 5\n        assert result.paused is False\n\n    def test_completed_property_true_when_not_paused(self):\n        \"\"\"Test completed is True when not paused.\"\"\"\n        result = CrawlResult(\n            stats=CrawlStats(),\n            items=ItemList(),\n            paused=False,\n        )\n\n        assert result.completed is True\n\n    def test_completed_property_false_when_paused(self):\n        \"\"\"Test completed is False when paused.\"\"\"\n        result = CrawlResult(\n            stats=CrawlStats(),\n            items=ItemList(),\n            paused=True,\n        )\n\n        assert result.completed is False\n\n    def test_len_returns_item_count(self):\n        \"\"\"Test len returns number of items.\"\"\"\n        items = ItemList()\n        items.extend([{\"id\": i} for i in range(10)])\n\n        result = CrawlResult(stats=CrawlStats(), items=items)\n\n        assert len(result) == 10\n\n    def test_iter_yields_items(self):\n        \"\"\"Test iteration yields items.\"\"\"\n        items = ItemList()\n        items.extend([{\"id\": 1}, {\"id\": 2}, {\"id\": 3}])\n\n        result = CrawlResult(stats=CrawlStats(), items=items)\n\n        collected = list(result)\n\n        assert collected == [{\"id\": 1}, {\"id\": 2}, {\"id\": 3}]\n\n    def test_result_with_stats(self):\n        \"\"\"Test CrawlResult with populated stats.\"\"\"\n        stats = CrawlStats(\n            requests_count=100,\n            items_scraped=50,\n            failed_requests_count=5,\n            start_time=0.0,\n            end_time=10.0,\n        )\n        items = ItemList()\n\n        result = CrawlResult(stats=stats, items=items)\n\n        assert result.stats.requests_count == 100\n        assert result.stats.items_scraped == 50\n        assert result.stats.requests_per_second == 10.0\n\n\nclass TestCrawlResultIntegration:\n    \"\"\"Integration tests for result classes.\"\"\"\n\n    def test_full_workflow(self):\n        \"\"\"Test realistic workflow with all result classes.\"\"\"\n        # Simulate a crawl\n        stats = CrawlStats(start_time=1000.0)\n\n        # Simulate requests\n        for _ in range(10):\n            stats.increment_requests_count(\"default\")\n            stats.increment_status(200)\n            stats.increment_response_bytes(\"example.com\", 5000)\n\n        # Simulate some failures\n        stats.failed_requests_count = 2\n        stats.blocked_requests_count = 1\n\n        # Collect items\n        items = ItemList()\n        for i in range(8):\n            items.append({\"product_id\": i, \"name\": f\"Product {i}\"})\n            stats.items_scraped += 1\n\n        # Finish crawl\n        stats.end_time = 1005.0\n\n        # Create result\n        result = CrawlResult(stats=stats, items=items, paused=False)\n\n        # Verify\n        assert result.completed is True\n        assert len(result) == 8\n        assert result.stats.requests_count == 10\n        assert result.stats.requests_per_second == 2.0\n   
     assert result.stats.response_bytes == 50000\n"
  },
  {
    "path": "tests/spiders/test_scheduler.py",
    "content": "\"\"\"Tests for the Scheduler class.\"\"\"\n\nimport pytest\n\nfrom scrapling.spiders.request import Request\nfrom scrapling.spiders.scheduler import Scheduler\nfrom scrapling.spiders.checkpoint import CheckpointData\n\n\nclass TestSchedulerInit:\n    \"\"\"Test Scheduler initialization.\"\"\"\n\n    def test_scheduler_starts_empty(self):\n        \"\"\"Test that scheduler starts with empty queue.\"\"\"\n        scheduler = Scheduler()\n\n        assert len(scheduler) == 0\n        assert scheduler.is_empty is True\n\n\nclass TestSchedulerEnqueue:\n    \"\"\"Test Scheduler enqueue functionality.\"\"\"\n\n    @pytest.mark.asyncio\n    async def test_enqueue_single_request(self):\n        \"\"\"Test enqueueing a single request.\"\"\"\n        scheduler = Scheduler()\n        request = Request(\"https://example.com\")\n\n        result = await scheduler.enqueue(request)\n\n        assert result is True\n        assert len(scheduler) == 1\n        assert scheduler.is_empty is False\n\n    @pytest.mark.asyncio\n    async def test_enqueue_multiple_requests(self):\n        \"\"\"Test enqueueing multiple requests.\"\"\"\n        scheduler = Scheduler()\n\n        for i in range(5):\n            request = Request(f\"https://example.com/{i}\")\n            await scheduler.enqueue(request)\n\n        assert len(scheduler) == 5\n\n    @pytest.mark.asyncio\n    async def test_enqueue_duplicate_filtered(self):\n        \"\"\"Test that duplicate requests are filtered by default.\"\"\"\n        scheduler = Scheduler()\n\n        request1 = Request(\"https://example.com\", sid=\"s1\")\n        request2 = Request(\"https://example.com\", sid=\"s1\")  # Same fingerprint\n\n        result1 = await scheduler.enqueue(request1)\n        result2 = await scheduler.enqueue(request2)\n\n        assert result1 is True\n        assert result2 is False  # Duplicate filtered\n        assert len(scheduler) == 1\n\n    @pytest.mark.asyncio\n    async def test_enqueue_duplicate_allowed_with_dont_filter(self):\n        \"\"\"Test that dont_filter allows duplicate requests.\"\"\"\n        scheduler = Scheduler()\n\n        request1 = Request(\"https://example.com\", sid=\"s1\")\n        request2 = Request(\"https://example.com\", sid=\"s1\", dont_filter=True)\n\n        result1 = await scheduler.enqueue(request1)\n        result2 = await scheduler.enqueue(request2)\n\n        assert result1 is True\n        assert result2 is True\n        assert len(scheduler) == 2\n\n    @pytest.mark.asyncio\n    async def test_enqueue_different_methods_not_duplicate(self):\n        \"\"\"Test that same URL with different methods are not duplicates.\"\"\"\n        scheduler = Scheduler()\n\n        request1 = Request(\"https://example.com\", method=\"GET\")\n        request2 = Request(\"https://example.com\", method=\"POST\")\n\n        result1 = await scheduler.enqueue(request1)\n        result2 = await scheduler.enqueue(request2)\n\n        assert result1 is True\n        assert result2 is True\n        assert len(scheduler) == 2\n\n\nclass TestSchedulerDequeue:\n    \"\"\"Test Scheduler dequeue functionality.\"\"\"\n\n    @pytest.mark.asyncio\n    async def test_dequeue_returns_request(self):\n        \"\"\"Test that dequeue returns the enqueued request.\"\"\"\n        scheduler = Scheduler()\n        original = Request(\"https://example.com\")\n\n        await scheduler.enqueue(original)\n        dequeued = await scheduler.dequeue()\n\n        assert dequeued.url == original.url\n\n    @pytest.mark.asyncio\n    async def 
test_dequeue_respects_priority_order(self):\n        \"\"\"Test that higher priority requests are dequeued first.\"\"\"\n        scheduler = Scheduler()\n\n        low = Request(\"https://example.com/low\", priority=1)\n        high = Request(\"https://example.com/high\", priority=10)\n        medium = Request(\"https://example.com/medium\", priority=5)\n\n        await scheduler.enqueue(low)\n        await scheduler.enqueue(high)\n        await scheduler.enqueue(medium)\n\n        # Should get high priority first\n        first = await scheduler.dequeue()\n        assert first.url == \"https://example.com/high\"\n\n        second = await scheduler.dequeue()\n        assert second.url == \"https://example.com/medium\"\n\n        third = await scheduler.dequeue()\n        assert third.url == \"https://example.com/low\"\n\n    @pytest.mark.asyncio\n    async def test_dequeue_fifo_for_same_priority(self):\n        \"\"\"Test FIFO ordering for requests with same priority.\"\"\"\n        scheduler = Scheduler()\n\n        for i in range(3):\n            request = Request(f\"https://example.com/{i}\", priority=5)\n            await scheduler.enqueue(request)\n\n        first = await scheduler.dequeue()\n        second = await scheduler.dequeue()\n        third = await scheduler.dequeue()\n\n        # Should be in FIFO order since same priority\n        assert first.url == \"https://example.com/0\"\n        assert second.url == \"https://example.com/1\"\n        assert third.url == \"https://example.com/2\"\n\n    @pytest.mark.asyncio\n    async def test_dequeue_updates_length(self):\n        \"\"\"Test that dequeue decreases the queue length.\"\"\"\n        scheduler = Scheduler()\n\n        await scheduler.enqueue(Request(\"https://example.com/1\"))\n        await scheduler.enqueue(Request(\"https://example.com/2\"))\n\n        assert len(scheduler) == 2\n\n        await scheduler.dequeue()\n        assert len(scheduler) == 1\n\n        await scheduler.dequeue()\n        assert len(scheduler) == 0\n        assert scheduler.is_empty is True\n\n\nclass TestSchedulerSnapshot:\n    \"\"\"Test Scheduler snapshot functionality for checkpointing.\"\"\"\n\n    @pytest.mark.asyncio\n    async def test_snapshot_empty_scheduler(self):\n        \"\"\"Test snapshot of empty scheduler.\"\"\"\n        scheduler = Scheduler()\n\n        requests, seen = scheduler.snapshot()\n\n        assert requests == []\n        assert seen == set()\n\n    @pytest.mark.asyncio\n    async def test_snapshot_captures_pending_requests(self):\n        \"\"\"Test snapshot captures all pending requests.\"\"\"\n        scheduler = Scheduler()\n\n        await scheduler.enqueue(Request(\"https://example.com/1\", priority=5))\n        await scheduler.enqueue(Request(\"https://example.com/2\", priority=10))\n        await scheduler.enqueue(Request(\"https://example.com/3\", priority=1))\n\n        requests, seen = scheduler.snapshot()\n\n        assert len(requests) == 3\n        # Should be sorted by priority (highest first due to negative priority in queue)\n        assert requests[0].url == \"https://example.com/2\"  # priority 10\n        assert requests[1].url == \"https://example.com/1\"  # priority 5\n        assert requests[2].url == \"https://example.com/3\"  # priority 1\n\n    @pytest.mark.asyncio\n    async def test_snapshot_captures_seen_set(self):\n        \"\"\"Test snapshot captures seen fingerprints.\"\"\"\n        scheduler = Scheduler()\n\n        await scheduler.enqueue(Request(\"https://example.com/1\"))\n        
await scheduler.enqueue(Request(\"https://example.com/2\"))\n\n        requests, seen = scheduler.snapshot()\n\n        assert len(seen) == 2\n        # Fingerprints are now bytes (SHA1 hashes)\n        for fp in seen:\n            assert isinstance(fp, bytes)\n            assert len(fp) == 20  # SHA1 produces 20 bytes\n\n    @pytest.mark.asyncio\n    async def test_snapshot_returns_copies(self):\n        \"\"\"Test that snapshot returns copies, not references.\"\"\"\n        scheduler = Scheduler()\n\n        await scheduler.enqueue(Request(\"https://example.com\"))\n\n        requests, seen = scheduler.snapshot()\n\n        # Modifying snapshot shouldn't affect scheduler\n        requests.append(Request(\"https://modified.com\"))\n        seen.add(b\"new_fingerprint_bytes\")\n\n        original_requests, original_seen = scheduler.snapshot()\n\n        assert len(original_requests) == 1\n        assert b\"new_fingerprint_bytes\" not in original_seen\n\n    @pytest.mark.asyncio\n    async def test_snapshot_excludes_dequeued_requests(self):\n        \"\"\"Test snapshot only includes pending requests.\"\"\"\n        scheduler = Scheduler()\n\n        await scheduler.enqueue(Request(\"https://example.com/1\"))\n        await scheduler.enqueue(Request(\"https://example.com/2\"))\n        await scheduler.enqueue(Request(\"https://example.com/3\"))\n\n        # Dequeue one\n        await scheduler.dequeue()\n\n        requests, seen = scheduler.snapshot()\n\n        # Snapshot should only have 2 pending requests\n        assert len(requests) == 2\n        # But seen should still have all 3 (deduplication tracking)\n        assert len(seen) == 3\n\n\nclass TestSchedulerRestore:\n    \"\"\"Test Scheduler restore functionality from checkpoint.\"\"\"\n\n    @pytest.mark.asyncio\n    async def test_restore_requests(self):\n        \"\"\"Test restoring requests from checkpoint data.\"\"\"\n        scheduler = Scheduler()\n\n        checkpoint_requests = [\n            Request(\"https://example.com/1\", priority=10),\n            Request(\"https://example.com/2\", priority=5),\n        ]\n        checkpoint_seen = {b\"fp1_bytes_padded!\", b\"fp2_bytes_padded!\", b\"fp3_bytes_padded!\"}\n\n        data = CheckpointData(requests=checkpoint_requests, seen=checkpoint_seen)\n\n        scheduler.restore(data)\n\n        assert len(scheduler) == 2\n\n    @pytest.mark.asyncio\n    async def test_restore_seen_set(self):\n        \"\"\"Test that restore sets up seen fingerprints.\"\"\"\n        scheduler = Scheduler()\n\n        data = CheckpointData(\n            requests=[],\n            seen={b\"fp1_bytes_here_pad\", b\"fp2_bytes_here_pad\"},  # Bytes fingerprints\n        )\n\n        scheduler.restore(data)\n\n        # Verify seen set was restored\n        _, seen = scheduler.snapshot()\n        assert seen == {b\"fp1_bytes_here_pad\", b\"fp2_bytes_here_pad\"}\n\n    @pytest.mark.asyncio\n    async def test_restore_maintains_priority_order(self):\n        \"\"\"Test that restored requests maintain priority order.\"\"\"\n        scheduler = Scheduler()\n\n        # Requests should already be sorted by priority in checkpoint\n        checkpoint_requests = [\n            Request(\"https://example.com/high\", priority=10),\n            Request(\"https://example.com/low\", priority=1),\n        ]\n\n        data = CheckpointData(requests=checkpoint_requests, seen=set())\n        scheduler.restore(data)\n\n        # Dequeue should return high priority first\n        first = await scheduler.dequeue()\n        
assert first.url == \"https://example.com/high\"\n\n        second = await scheduler.dequeue()\n        assert second.url == \"https://example.com/low\"\n\n    @pytest.mark.asyncio\n    async def test_restore_empty_checkpoint(self):\n        \"\"\"Test restoring from empty checkpoint.\"\"\"\n        scheduler = Scheduler()\n\n        data = CheckpointData(requests=[], seen=set())\n        scheduler.restore(data)\n\n        assert len(scheduler) == 0\n        assert scheduler.is_empty is True\n\n\nclass TestSchedulerIntegration:\n    \"\"\"Integration tests for Scheduler with checkpoint roundtrip.\"\"\"\n\n    @pytest.mark.asyncio\n    async def test_snapshot_and_restore_roundtrip(self):\n        \"\"\"Test that snapshot -> restore works correctly.\"\"\"\n        # Create and populate original scheduler\n        original = Scheduler()\n\n        await original.enqueue(Request(\"https://example.com/1\", sid=\"s1\", priority=10))\n        await original.enqueue(Request(\"https://example.com/2\", sid=\"s1\", priority=5))\n        await original.enqueue(Request(\"https://example.com/3\", sid=\"s2\", priority=7))\n\n        # Snapshot\n        requests, seen = original.snapshot()\n        data = CheckpointData(requests=requests, seen=seen)\n\n        # Restore to new scheduler\n        restored = Scheduler()\n        restored.restore(data)\n\n        # Verify state matches\n        assert len(restored) == len(original)\n\n        # Dequeue from both and compare\n        for _ in range(3):\n            orig_req = await original.dequeue()\n            rest_req = await restored.dequeue()\n            assert orig_req.url == rest_req.url\n            assert orig_req.priority == rest_req.priority\n\n    @pytest.mark.asyncio\n    async def test_partial_processing_then_checkpoint(self):\n        \"\"\"Test checkpointing after partial processing.\"\"\"\n        scheduler = Scheduler()\n\n        # Enqueue 5 requests\n        for i in range(5):\n            await scheduler.enqueue(Request(f\"https://example.com/{i}\"))\n\n        # Process 2\n        await scheduler.dequeue()\n        await scheduler.dequeue()\n\n        # Snapshot should show 3 pending, 5 seen\n        requests, seen = scheduler.snapshot()\n\n        assert len(requests) == 3\n        assert len(seen) == 5\n\n    @pytest.mark.asyncio\n    async def test_deduplication_after_restore(self):\n        \"\"\"Test that deduplication works after restore.\"\"\"\n        scheduler = Scheduler()\n\n        await scheduler.enqueue(Request(\"https://example.com\", sid=\"s1\"))\n\n        requests, seen = scheduler.snapshot()\n        data = CheckpointData(requests=requests, seen=seen)\n\n        # Restore to new scheduler\n        new_scheduler = Scheduler()\n        new_scheduler.restore(data)\n\n        # Try to add duplicate - should be filtered\n        result = await new_scheduler.enqueue(Request(\"https://example.com\", sid=\"s1\"))\n\n        assert result is False  # Duplicate filtered based on restored seen set\n"
  },
  {
    "path": "tests/spiders/test_session.py",
    "content": "\"\"\"Tests for the SessionManager class.\"\"\"\n\nfrom scrapling.core._types import Any\nimport pytest\n\nfrom scrapling.spiders.session import SessionManager\n\n\nclass MockSession:  # type: ignore[type-arg]\n    \"\"\"Mock session for testing without actual network calls.\"\"\"\n\n    def __init__(self, name: str = \"mock\"):\n        self.name = name\n        self._is_alive = False\n        self._started = False\n        self._closed = False\n\n    async def __aenter__(self):\n        self._is_alive = True\n        self._started = True\n        return self\n\n    async def __aexit__(self, *args):\n        self._is_alive = False\n        self._closed = True\n\n    async def fetch(self, url: str, **kwargs):\n        pass\n\n\nclass TestSessionManagerInit:\n    \"\"\"Test SessionManager initialization.\"\"\"\n\n    def test_manager_starts_empty(self):\n        \"\"\"Test that manager starts with no sessions.\"\"\"\n        manager = SessionManager()\n\n        assert len(manager) == 0\n\n    def test_manager_no_default_session_when_empty(self):\n        \"\"\"Test that accessing default_session_id raises when empty.\"\"\"\n        manager = SessionManager()\n\n        with pytest.raises(RuntimeError, match=\"No sessions registered\"):\n            _ = manager.default_session_id\n\n\nclass TestSessionManagerAdd:\n    \"\"\"Test SessionManager add functionality.\"\"\"\n\n    def test_add_single_session(self):\n        \"\"\"Test adding a single session.\"\"\"\n        manager = SessionManager()\n        session = MockSession()\n\n        manager.add(\"test\", session)\n\n        assert len(manager) == 1\n        assert \"test\" in manager\n        assert manager.session_ids == [\"test\"]\n\n    def test_first_session_becomes_default(self):\n        \"\"\"Test that first added session becomes default.\"\"\"\n        manager = SessionManager()\n        session = MockSession()\n\n        manager.add(\"first\", session)\n\n        assert manager.default_session_id == \"first\"\n\n    def test_add_multiple_sessions(self):\n        \"\"\"Test adding multiple sessions.\"\"\"\n        manager = SessionManager()\n\n        manager.add(\"session1\", MockSession(\"s1\"))\n        manager.add(\"session2\", MockSession(\"s2\"))\n        manager.add(\"session3\", MockSession(\"s3\"))\n\n        assert len(manager) == 3\n        assert \"session1\" in manager\n        assert \"session2\" in manager\n        assert \"session3\" in manager\n\n    def test_explicit_default_session(self):\n        \"\"\"Test setting explicit default session.\"\"\"\n        manager = SessionManager()\n\n        manager.add(\"first\", MockSession())\n        manager.add(\"second\", MockSession(), default=True)\n\n        assert manager.default_session_id == \"second\"\n\n    def test_add_duplicate_id_raises(self):\n        \"\"\"Test that adding duplicate session ID raises.\"\"\"\n        manager = SessionManager()\n        manager.add(\"test\", MockSession())\n\n        with pytest.raises(ValueError, match=\"already registered\"):\n            manager.add(\"test\", MockSession())\n\n    def test_add_returns_self_for_chaining(self):\n        \"\"\"Test that add returns self for method chaining.\"\"\"\n        manager = SessionManager()\n\n        result = manager.add(\"test\", MockSession())\n\n        assert result is manager\n\n    def test_method_chaining(self):\n        \"\"\"Test fluent interface for adding sessions.\"\"\"\n        manager = SessionManager()\n\n        manager.add(\"s1\", 
MockSession()).add(\"s2\", MockSession()).add(\"s3\", MockSession())\n\n        assert len(manager) == 3\n\n    def test_add_lazy_session(self):\n        \"\"\"Test adding lazy session.\"\"\"\n        manager = SessionManager()\n\n        manager.add(\"lazy\", MockSession(), lazy=True)\n\n        assert \"lazy\" in manager\n        assert \"lazy\" in manager._lazy_sessions\n\n\nclass TestSessionManagerRemove:\n    \"\"\"Test SessionManager remove/pop functionality.\"\"\"\n\n    def test_remove_session(self):\n        \"\"\"Test removing a session.\"\"\"\n        manager = SessionManager()\n        manager.add(\"test\", MockSession())\n\n        manager.remove(\"test\")\n\n        assert \"test\" not in manager\n        assert len(manager) == 0\n\n    def test_remove_nonexistent_raises(self):\n        \"\"\"Test removing nonexistent session raises.\"\"\"\n        manager = SessionManager()\n\n        with pytest.raises(KeyError, match=\"not found\"):\n            manager.remove(\"nonexistent\")\n\n    def test_pop_returns_session(self):\n        \"\"\"Test pop returns the removed session.\"\"\"\n        manager = SessionManager()\n        session = MockSession(\"original\")\n        manager.add(\"test\", session)\n\n        popped = manager.pop(\"test\")\n\n        assert popped is session\n        assert \"test\" not in manager\n\n    def test_remove_default_updates_default(self):\n        \"\"\"Test that removing default session updates default.\"\"\"\n        manager = SessionManager()\n        manager.add(\"first\", MockSession())\n        manager.add(\"second\", MockSession())\n\n        assert manager.default_session_id == \"first\"\n\n        manager.remove(\"first\")\n\n        assert manager.default_session_id == \"second\"\n\n    def test_remove_lazy_session_cleans_up(self):\n        \"\"\"Test that removing lazy session cleans up lazy set.\"\"\"\n        manager = SessionManager()\n        manager.add(\"lazy\", MockSession(), lazy=True)\n\n        manager.remove(\"lazy\")\n\n        assert \"lazy\" not in manager._lazy_sessions\n\n\nclass TestSessionManagerGet:\n    \"\"\"Test SessionManager get functionality.\"\"\"\n\n    def test_get_existing_session(self):\n        \"\"\"Test getting an existing session.\"\"\"\n        manager = SessionManager()\n        session = MockSession(\"test\")\n        manager.add(\"test\", session)\n\n        retrieved = manager.get(\"test\")\n\n        assert retrieved is session\n\n    def test_get_nonexistent_raises_with_available(self):\n        \"\"\"Test getting nonexistent session shows available sessions.\"\"\"\n        manager = SessionManager()\n        manager.add(\"session1\", MockSession())\n        manager.add(\"session2\", MockSession())\n\n        with pytest.raises(KeyError, match=\"Available:\"):\n            manager.get(\"nonexistent\")\n\n\nclass TestSessionManagerContains:\n    \"\"\"Test SessionManager contains functionality.\"\"\"\n\n    def test_contains_existing(self):\n        \"\"\"Test contains for existing session.\"\"\"\n        manager = SessionManager()\n        manager.add(\"test\", MockSession())\n\n        assert \"test\" in manager\n\n    def test_not_contains_missing(self):\n        \"\"\"Test contains for missing session.\"\"\"\n        manager = SessionManager()\n        manager.add(\"test\", MockSession())\n\n        assert \"other\" not in manager\n\n\nclass TestSessionManagerAsyncContext:\n    \"\"\"Test SessionManager async context manager.\"\"\"\n\n    @pytest.mark.asyncio\n    async def 
test_start_activates_sessions(self):\n        \"\"\"Test that start activates non-lazy sessions.\"\"\"\n        manager = SessionManager()\n        session = MockSession()\n        manager.add(\"test\", session)\n\n        await manager.start()\n\n        assert session._is_alive is True\n        assert manager._started is True\n\n    @pytest.mark.asyncio\n    async def test_start_skips_lazy_sessions(self):\n        \"\"\"Test that start skips lazy sessions.\"\"\"\n        manager = SessionManager()\n        eager_session = MockSession(\"eager\")\n        lazy_session = MockSession(\"lazy\")\n\n        manager.add(\"eager\", eager_session)\n        manager.add(\"lazy\", lazy_session, lazy=True)\n\n        await manager.start()\n\n        assert eager_session._is_alive is True\n        assert lazy_session._is_alive is False\n\n    @pytest.mark.asyncio\n    async def test_close_deactivates_sessions(self):\n        \"\"\"Test that close deactivates all sessions.\"\"\"\n        manager = SessionManager()\n        session = MockSession()\n        manager.add(\"test\", session)\n\n        await manager.start()\n        assert session._is_alive is True\n\n        await manager.close()\n        assert session._is_alive is False\n        assert manager._started is False\n\n    @pytest.mark.asyncio\n    async def test_async_context_manager(self):\n        \"\"\"Test using SessionManager as async context manager.\"\"\"\n        manager = SessionManager()\n        session = MockSession()\n        manager.add(\"test\", session)\n\n        async with manager:\n            assert session._is_alive is True\n\n        assert session._is_alive is False\n\n    @pytest.mark.asyncio\n    async def test_start_idempotent(self):\n        \"\"\"Test that calling start multiple times is safe.\"\"\"\n        manager = SessionManager()\n        session = MockSession()\n        manager.add(\"test\", session)\n\n        await manager.start()\n        await manager.start()  # Should not raise or double-start\n\n        assert session._started is True\n\n\nclass TestSessionManagerProperties:\n    \"\"\"Test SessionManager properties.\"\"\"\n\n    def test_session_ids_returns_list(self):\n        \"\"\"Test session_ids returns list of IDs.\"\"\"\n        manager = SessionManager()\n        manager.add(\"a\", MockSession())\n        manager.add(\"b\", MockSession())\n        manager.add(\"c\", MockSession())\n\n        ids = manager.session_ids\n\n        assert isinstance(ids, list)\n        assert set(ids) == {\"a\", \"b\", \"c\"}\n\n    def test_len_returns_session_count(self):\n        \"\"\"Test len returns number of sessions.\"\"\"\n        manager = SessionManager()\n\n        assert len(manager) == 0\n\n        manager.add(\"s1\", MockSession())\n        assert len(manager) == 1\n\n        manager.add(\"s2\", MockSession())\n        assert len(manager) == 2\n\n\nclass TestSessionManagerIntegration:\n    \"\"\"Integration tests for SessionManager.\"\"\"\n\n    def test_realistic_setup(self):\n        \"\"\"Test realistic session manager setup.\"\"\"\n        manager = SessionManager()\n\n        # Add different types of sessions\n        manager.add(\"default\", MockSession(\"default\"))\n        manager.add(\"backup\", MockSession(\"backup\"))\n        manager.add(\"lazy_special\", MockSession(\"special\"), lazy=True)\n\n        assert len(manager) == 3\n        assert manager.default_session_id == \"default\"\n        assert \"lazy_special\" in manager._lazy_sessions\n\n    @pytest.mark.asyncio\n    async def 
test_lifecycle_management(self):\n        \"\"\"Test complete lifecycle of session manager.\"\"\"\n        manager = SessionManager()\n        sessions = [MockSession(f\"s{i}\") for i in range(3)]\n\n        for i, session in enumerate(sessions):\n            manager.add(f\"session{i}\", session)\n\n        # Before start - no sessions active\n        assert all(not s._is_alive for s in sessions)\n\n        # After start - all active\n        await manager.start()\n        assert all(s._is_alive for s in sessions)\n\n        # After close - all inactive\n        await manager.close()\n        assert all(not s._is_alive for s in sessions)\n"
  },
  {
    "path": "tests/spiders/test_spider.py",
    "content": "\"\"\"Tests for the Spider class and related components.\"\"\"\n\nimport logging\nimport tempfile\nfrom pathlib import Path\n\nimport pytest\n\nfrom scrapling.spiders.spider import Spider, SessionConfigurationError, LogCounterHandler, BLOCKED_CODES\nfrom scrapling.spiders.request import Request\nfrom scrapling.spiders.session import SessionManager\nfrom scrapling.spiders.result import CrawlStats\nfrom scrapling.core._types import Any, Dict, AsyncGenerator\n\n\nclass TestLogCounterHandler:\n    \"\"\"Test LogCounterHandler for tracking log counts.\"\"\"\n\n    def test_initial_counts_are_zero(self):\n        \"\"\"Test that handler starts with zero counts.\"\"\"\n        handler = LogCounterHandler()\n        counts = handler.get_counts()\n\n        assert counts[\"debug\"] == 0\n        assert counts[\"info\"] == 0\n        assert counts[\"warning\"] == 0\n        assert counts[\"error\"] == 0\n        assert counts[\"critical\"] == 0\n\n    def test_counts_debug_messages(self):\n        \"\"\"Test counting debug level messages.\"\"\"\n        handler = LogCounterHandler()\n        record = logging.LogRecord(\n            name=\"test\",\n            level=logging.DEBUG,\n            pathname=\"\",\n            lineno=0,\n            msg=\"test\",\n            args=(),\n            exc_info=None,\n        )\n\n        handler.emit(record)\n        handler.emit(record)\n\n        assert handler.get_counts()[\"debug\"] == 2\n\n    def test_counts_info_messages(self):\n        \"\"\"Test counting info level messages.\"\"\"\n        handler = LogCounterHandler()\n        record = logging.LogRecord(\n            name=\"test\",\n            level=logging.INFO,\n            pathname=\"\",\n            lineno=0,\n            msg=\"test\",\n            args=(),\n            exc_info=None,\n        )\n\n        handler.emit(record)\n\n        assert handler.get_counts()[\"info\"] == 1\n\n    def test_counts_warning_messages(self):\n        \"\"\"Test counting warning level messages.\"\"\"\n        handler = LogCounterHandler()\n        record = logging.LogRecord(\n            name=\"test\",\n            level=logging.WARNING,\n            pathname=\"\",\n            lineno=0,\n            msg=\"test\",\n            args=(),\n            exc_info=None,\n        )\n\n        handler.emit(record)\n\n        assert handler.get_counts()[\"warning\"] == 1\n\n    def test_counts_error_messages(self):\n        \"\"\"Test counting error level messages.\"\"\"\n        handler = LogCounterHandler()\n        record = logging.LogRecord(\n            name=\"test\",\n            level=logging.ERROR,\n            pathname=\"\",\n            lineno=0,\n            msg=\"test\",\n            args=(),\n            exc_info=None,\n        )\n\n        handler.emit(record)\n\n        assert handler.get_counts()[\"error\"] == 1\n\n    def test_counts_critical_messages(self):\n        \"\"\"Test counting critical level messages.\"\"\"\n        handler = LogCounterHandler()\n        record = logging.LogRecord(\n            name=\"test\",\n            level=logging.CRITICAL,\n            pathname=\"\",\n            lineno=0,\n            msg=\"test\",\n            args=(),\n            exc_info=None,\n        )\n\n        handler.emit(record)\n\n        assert handler.get_counts()[\"critical\"] == 1\n\n    def test_counts_multiple_levels(self):\n        \"\"\"Test counting messages at different levels.\"\"\"\n        handler = LogCounterHandler()\n\n        levels = [\n            logging.DEBUG,\n            
logging.DEBUG,\n            logging.INFO,\n            logging.WARNING,\n            logging.ERROR,\n            logging.ERROR,\n            logging.ERROR,\n            logging.CRITICAL,\n        ]\n\n        for level in levels:\n            record = logging.LogRecord(\n                name=\"test\",\n                level=level,\n                pathname=\"\",\n                lineno=0,\n                msg=\"test\",\n                args=(),\n                exc_info=None,\n            )\n            handler.emit(record)\n\n        counts = handler.get_counts()\n        assert counts[\"debug\"] == 2\n        assert counts[\"info\"] == 1\n        assert counts[\"warning\"] == 1\n        assert counts[\"error\"] == 3\n        assert counts[\"critical\"] == 1\n\n\nclass TestBlockedCodes:\n    \"\"\"Test BLOCKED_CODES constant.\"\"\"\n\n    def test_blocked_codes_contains_expected_values(self):\n        \"\"\"Test that BLOCKED_CODES contains expected HTTP status codes.\"\"\"\n        assert 401 in BLOCKED_CODES  # Unauthorized\n        assert 403 in BLOCKED_CODES  # Forbidden\n        assert 407 in BLOCKED_CODES  # Proxy Authentication Required\n        assert 429 in BLOCKED_CODES  # Too Many Requests\n        assert 444 in BLOCKED_CODES  # Connection Closed Without Response (nginx)\n        assert 500 in BLOCKED_CODES  # Internal Server Error\n        assert 502 in BLOCKED_CODES  # Bad Gateway\n        assert 503 in BLOCKED_CODES  # Service Unavailable\n        assert 504 in BLOCKED_CODES  # Gateway Timeout\n\n    def test_blocked_codes_does_not_contain_success(self):\n        \"\"\"Test that success codes are not blocked.\"\"\"\n        assert 200 not in BLOCKED_CODES\n        assert 201 not in BLOCKED_CODES\n        assert 204 not in BLOCKED_CODES\n        assert 301 not in BLOCKED_CODES\n        assert 302 not in BLOCKED_CODES\n\n\nclass ConcreteSpider(Spider):\n    \"\"\"Concrete spider implementation for testing.\"\"\"\n\n    name = \"test_spider\"\n    start_urls = [\"https://example.com\"]\n\n    async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n        yield {\"url\": str(response)}\n\n\nclass TestSpiderInit:\n    \"\"\"Test Spider initialization.\"\"\"\n\n    def test_spider_requires_name(self):\n        \"\"\"Test that spider without name raises ValueError.\"\"\"\n\n        class NoNameSpider(Spider):\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        with pytest.raises(ValueError, match=\"must have a name\"):\n            NoNameSpider()\n\n    def test_spider_initializes_logger(self):\n        \"\"\"Test that spider creates a logger.\"\"\"\n        spider = ConcreteSpider()\n\n        assert spider.logger is not None\n        assert spider.logger.name == \"scrapling.spiders.test_spider\"\n\n    def test_spider_logger_has_log_counter(self):\n        \"\"\"Test that spider logger has log counter handler.\"\"\"\n        spider = ConcreteSpider()\n\n        assert spider._log_counter is not None\n        assert isinstance(spider._log_counter, LogCounterHandler)\n\n    def test_spider_with_crawldir(self):\n        \"\"\"Test spider initialization with crawldir.\"\"\"\n        with tempfile.TemporaryDirectory() as tmpdir:\n            spider = ConcreteSpider(crawldir=tmpdir)\n\n            assert spider.crawldir == Path(tmpdir)\n\n    def test_spider_without_crawldir(self):\n        \"\"\"Test spider initialization without crawldir.\"\"\"\n        spider 
= ConcreteSpider()\n\n        assert spider.crawldir is None\n\n    def test_spider_custom_interval(self):\n        \"\"\"Test spider with custom checkpoint interval.\"\"\"\n        spider = ConcreteSpider(interval=60.0)\n\n        assert spider._interval == 60.0\n\n    def test_spider_default_interval(self):\n        \"\"\"Test spider has default checkpoint interval.\"\"\"\n        spider = ConcreteSpider()\n\n        assert spider._interval == 300.0\n\n    def test_spider_repr(self):\n        \"\"\"Test spider string representation.\"\"\"\n        spider = ConcreteSpider()\n\n        repr_str = repr(spider)\n\n        assert \"ConcreteSpider\" in repr_str\n        assert \"test_spider\" in repr_str\n\n\nclass TestSpiderClassAttributes:\n    \"\"\"Test Spider class attribute defaults.\"\"\"\n\n    def test_default_concurrent_requests(self):\n        \"\"\"Test default concurrent_requests is 4.\"\"\"\n        assert ConcreteSpider.concurrent_requests == 4\n\n    def test_default_concurrent_requests_per_domain(self):\n        \"\"\"Test default concurrent_requests_per_domain is 0 (disabled).\"\"\"\n        assert ConcreteSpider.concurrent_requests_per_domain == 0\n\n    def test_default_download_delay(self):\n        \"\"\"Test default download_delay is 0.\"\"\"\n        assert ConcreteSpider.download_delay == 0.0\n\n    def test_default_max_blocked_retries(self):\n        \"\"\"Test default max_blocked_retries is 3.\"\"\"\n        assert ConcreteSpider.max_blocked_retries == 3\n\n    def test_default_logging_level(self):\n        \"\"\"Test default logging level is DEBUG.\"\"\"\n        assert ConcreteSpider.logging_level == logging.DEBUG\n\n    def test_default_allowed_domains_empty(self):\n        \"\"\"Test default allowed_domains is empty set.\"\"\"\n        assert ConcreteSpider.allowed_domains == set()\n\n\nclass TestSpiderSessionConfiguration:\n    \"\"\"Test Spider session configuration.\"\"\"\n\n    def test_default_configure_sessions(self):\n        \"\"\"Test that default configure_sessions adds a session.\"\"\"\n        spider = ConcreteSpider()\n\n        assert len(spider._session_manager) > 0\n\n    def test_configure_sessions_error_raises_custom_exception(self):\n        \"\"\"Test that errors in configure_sessions raise SessionConfigurationError.\"\"\"\n\n        class BadSessionSpider(Spider):\n            name = \"bad_spider\"\n\n            def configure_sessions(self, manager: SessionManager) -> None:\n                raise RuntimeError(\"Configuration failed!\")\n\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        with pytest.raises(SessionConfigurationError, match=\"Configuration failed\"):\n            BadSessionSpider()\n\n    def test_configure_sessions_no_sessions_raises(self):\n        \"\"\"Test that not adding any sessions raises SessionConfigurationError.\"\"\"\n\n        class NoSessionSpider(Spider):\n            name = \"no_session_spider\"\n\n            def configure_sessions(self, manager: SessionManager) -> None:\n                pass  # Don't add any sessions\n\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        with pytest.raises(SessionConfigurationError, match=\"did not add any sessions\"):\n            NoSessionSpider()\n\n\nclass TestSpiderStartRequests:\n    \"\"\"Test Spider start_requests method.\"\"\"\n\n    @pytest.mark.asyncio\n    async def 
test_start_requests_yields_from_start_urls(self):\n        \"\"\"Test that start_requests yields requests for start_urls.\"\"\"\n\n        class MultiUrlSpider(Spider):\n            name = \"multi_url\"\n            start_urls = [\n                \"https://example.com/1\",\n                \"https://example.com/2\",\n                \"https://example.com/3\",\n            ]\n\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        spider = MultiUrlSpider()\n        requests = [r async for r in spider.start_requests()]\n\n        assert len(requests) == 3\n        assert requests[0].url == \"https://example.com/1\"\n        assert requests[1].url == \"https://example.com/2\"\n        assert requests[2].url == \"https://example.com/3\"\n\n    @pytest.mark.asyncio\n    async def test_start_requests_no_urls_raises(self):\n        \"\"\"Test that start_requests raises when no start_urls.\"\"\"\n\n        class NoUrlSpider(Spider):\n            name = \"no_url\"\n            start_urls = []\n\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        spider = NoUrlSpider()\n\n        with pytest.raises(RuntimeError, match=\"no starting point\"):\n            async for _ in spider.start_requests():\n                pass\n\n    @pytest.mark.asyncio\n    async def test_start_requests_uses_default_session(self):\n        \"\"\"Test that start_requests uses default session ID.\"\"\"\n        spider = ConcreteSpider()\n        requests = [r async for r in spider.start_requests()]\n\n        # Should use the default session from session manager\n        default_sid = spider._session_manager.default_session_id\n        assert requests[0].sid == default_sid\n\n\nclass TestSpiderHooks:\n    \"\"\"Test Spider lifecycle hooks.\"\"\"\n\n    @pytest.mark.asyncio\n    async def test_on_start_default(self):\n        \"\"\"Test default on_start doesn't raise.\"\"\"\n        spider = ConcreteSpider()\n\n        # Should not raise\n        await spider.on_start(resuming=False)\n        await spider.on_start(resuming=True)\n\n    @pytest.mark.asyncio\n    async def test_on_close_default(self):\n        \"\"\"Test default on_close doesn't raise.\"\"\"\n        spider = ConcreteSpider()\n\n        # Should not raise\n        await spider.on_close()\n\n    @pytest.mark.asyncio\n    async def test_on_error_default(self):\n        \"\"\"Test default on_error logs the error.\"\"\"\n        spider = ConcreteSpider()\n        request = Request(\"https://example.com\")\n        error = ValueError(\"test error\")\n\n        # Should not raise\n        await spider.on_error(request, error)\n\n    @pytest.mark.asyncio\n    async def test_on_scraped_item_default_returns_item(self):\n        \"\"\"Test default on_scraped_item returns the item unchanged.\"\"\"\n        spider = ConcreteSpider()\n        item = {\"key\": \"value\", \"nested\": {\"a\": 1}}\n\n        result = await spider.on_scraped_item(item)\n\n        assert result == item\n\n    @pytest.mark.asyncio\n    async def test_is_blocked_default_checks_status_codes(self):\n        \"\"\"Test default is_blocked checks blocked status codes.\"\"\"\n\n        class MockResponse:\n            def __init__(self, status: int):\n                self.status = status\n\n        spider = ConcreteSpider()\n\n        # Test blocked codes\n        assert await spider.is_blocked(MockResponse(403)) is True\n        
assert await spider.is_blocked(MockResponse(429)) is True\n        assert await spider.is_blocked(MockResponse(503)) is True\n\n        # Test non-blocked codes\n        assert await spider.is_blocked(MockResponse(200)) is False\n        assert await spider.is_blocked(MockResponse(404)) is False\n\n    @pytest.mark.asyncio\n    async def test_retry_blocked_request_default_returns_request(self):\n        \"\"\"Test default retry_blocked_request returns the request unchanged.\"\"\"\n\n        class MockResponse:\n            status = 429\n\n        spider = ConcreteSpider()\n        request = Request(\"https://example.com\", priority=5)\n\n        result = await spider.retry_blocked_request(request, MockResponse())\n\n        assert result is request\n\n\nclass TestSpiderPause:\n    \"\"\"Test Spider pause functionality.\"\"\"\n\n    def test_pause_without_engine_raises(self):\n        \"\"\"Test that pause without active engine raises RuntimeError.\"\"\"\n        spider = ConcreteSpider()\n\n        with pytest.raises(RuntimeError, match=\"No active crawl to stop\"):\n            spider.pause()\n\n\nclass TestSpiderStats:\n    \"\"\"Test Spider stats property.\"\"\"\n\n    def test_stats_without_engine_raises(self):\n        \"\"\"Test that accessing stats without active crawl raises.\"\"\"\n        spider = ConcreteSpider()\n\n        with pytest.raises(RuntimeError, match=\"No active crawl\"):\n            _ = spider.stats\n\n\nclass TestSpiderCustomization:\n    \"\"\"Test Spider customization patterns.\"\"\"\n\n    def test_custom_concurrent_requests(self):\n        \"\"\"Test spider with custom concurrent_requests.\"\"\"\n\n        class CustomSpider(Spider):\n            name = \"custom\"\n            concurrent_requests = 32\n            start_urls = [\"https://example.com\"]\n\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        spider = CustomSpider()\n        assert spider.concurrent_requests == 32\n\n    def test_custom_allowed_domains(self):\n        \"\"\"Test spider with allowed_domains.\"\"\"\n\n        class DomainSpider(Spider):\n            name = \"domain_spider\"\n            start_urls = [\"https://example.com\"]\n            allowed_domains = {\"example.com\", \"api.example.com\"}\n\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        spider = DomainSpider()\n        assert \"example.com\" in spider.allowed_domains\n        assert \"api.example.com\" in spider.allowed_domains\n\n    def test_custom_download_delay(self):\n        \"\"\"Test spider with download delay.\"\"\"\n\n        class SlowSpider(Spider):\n            name = \"slow\"\n            download_delay = 1.5\n            start_urls = [\"https://example.com\"]\n\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                yield None\n\n        spider = SlowSpider()\n        assert spider.download_delay == 1.5\n\n\nclass TestSpiderLogging:\n    \"\"\"Test Spider logging configuration.\"\"\"\n\n    def test_custom_logging_level(self):\n        \"\"\"Test spider with custom logging level.\"\"\"\n\n        class QuietSpider(Spider):\n            name = \"quiet\"\n            logging_level = logging.WARNING\n            start_urls = [\"https://example.com\"]\n\n            async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                
yield None\n\n        spider = QuietSpider()\n        assert spider.logger.level == logging.WARNING\n\n    def test_log_file_creates_handler(self):\n        \"\"\"Test spider with log file creates file handler.\"\"\"\n        with tempfile.TemporaryDirectory() as tmpdir:\n            log_path = Path(tmpdir) / \"spider.log\"\n\n            class FileLogSpider(Spider):\n                name = \"file_log\"\n                log_file = str(log_path)\n                start_urls = [\"https://example.com\"]\n\n                async def parse(self, response) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:\n                    yield None\n\n            spider = FileLogSpider()\n\n            # Should have a file handler\n            file_handlers = [\n                h for h in spider.logger.handlers if isinstance(h, logging.FileHandler)\n            ]\n            assert len(file_handlers) == 1\n\n            # Clean up\n            for h in file_handlers:\n                h.close()\n\n    def test_logger_does_not_propagate(self):\n        \"\"\"Test that spider logger does not propagate to parent.\"\"\"\n        spider = ConcreteSpider()\n\n        assert spider.logger.propagate is False\n\n\nclass TestSessionConfigurationError:\n    \"\"\"Test SessionConfigurationError exception.\"\"\"\n\n    def test_exception_message(self):\n        \"\"\"Test that exception preserves message.\"\"\"\n        error = SessionConfigurationError(\"Custom error message\")\n\n        assert str(error) == \"Custom error message\"\n\n    def test_exception_is_exception(self):\n        \"\"\"Test that it's a proper exception.\"\"\"\n        error = SessionConfigurationError(\"test\")\n\n        assert isinstance(error, Exception)\n"
  },
  {
    "path": "tox.ini",
    "content": "# Tox (https://tox.readthedocs.io/) is a tool for running tests\n# in multiple virtualenvs. This configuration file will run the\n# test suite on all supported python versions. To use it, \"pip install tox\"\n# and then run \"tox\" from this directory.\n\n[tox]\nenvlist = pre-commit,py{310,311,312,313}\n\n[testenv]\nusedevelop = True\nchangedir = tests\ndeps =\n    playwright==1.58.0\n    patchright==1.58.2\n    -r{toxinidir}/tests/requirements.txt\nextras = ai,shell\ncommands =\n    # Run browser tests without parallelization (avoid browser conflicts)\n    pytest --cov=scrapling --cov-report=xml -k \"DynamicFetcher or StealthyFetcher\" --verbose\n    # Run asyncio tests without parallelization (avoid GitHub CI nested loop issues)\n    pytest --cov=scrapling --cov-report=xml -m \"asyncio\" -k \"not (DynamicFetcher or StealthyFetcher)\" --verbose --cov-append\n    # Run everything else with parallelization (for speed)\n    pytest --cov=scrapling --cov-report=xml -m \"not asyncio\" -k \"not (DynamicFetcher or StealthyFetcher)\" -n auto --cov-append\n\n[testenv:pre-commit]\nbasepython = python3\ndeps = pre-commit\ncommands = pre-commit run --all-files --show-diff-on-failure\nskip_install = true"
  },
  {
    "path": "zensical.toml",
    "content": "[project]\nsite_name = \"Scrapling\"\nsite_description = \"Scrapling - Effortless Web Scraping for the Modern Web!\"\nsite_author = \"Karim Shoair\"\nrepo_url = \"https://github.com/D4Vinci/Scrapling\"\nsite_url = \"https://scrapling.readthedocs.io/en/latest/\"\nrepo_name = \"D4Vinci/Scrapling\"\ncopyright = \"Copyright &copy; 2025 Karim Shoair - <a href=\\\"#__consent\\\">Change cookie settings</a>\"\ndocs_dir = \"docs\"\nuse_directory_urls = false\nexclude_docs = \"\"\"\nREADME*.md\n\"\"\"\nextra_css = [\"stylesheets/extra.css\"]\n\nnav = [\n    {Introduction = \"index.md\"},\n    {Overview = \"overview.md\"},\n    {\"Performance Benchmarks\" = \"benchmarks.md\"},\n    {\"User Guide\" = [\n        {Parsing = [\n            {\"Querying elements\" = \"parsing/selection.md\"},\n            {\"Main classes\" = \"parsing/main_classes.md\"},\n            {\"Adaptive scraping\" = \"parsing/adaptive.md\"}\n        ]},\n        {Fetching = [\n            {\"Fetchers basics\" = \"fetching/choosing.md\"},\n            {\"HTTP requests\" = \"fetching/static.md\"},\n            {\"Dynamic websites\" = \"fetching/dynamic.md\"},\n            {\"Dynamic websites with hard protections\" = \"fetching/stealthy.md\"}\n        ]},\n        {Spiders = [\n            {\"Architecture\" = \"spiders/architecture.md\"},\n            {\"Getting started\" = \"spiders/getting-started.md\"},\n            {\"Requests & Responses\" = \"spiders/requests-responses.md\"},\n            {\"Sessions\" = \"spiders/sessions.md\"},\n            {\"Proxy management & Blocking\" = \"spiders/proxy-blocking.md\"},\n            {\"Advanced features\" = \"spiders/advanced.md\"}\n        ]},\n        {\"Command Line Interface\" = [\n            {Overview = \"cli/overview.md\"},\n            {\"Interactive shell\" = \"cli/interactive-shell.md\"},\n            {\"Extract commands\" = \"cli/extract-commands.md\"}\n        ]},\n        {Integrations = [\n            {\"AI MCP server\" = \"ai/mcp-server.md\"}\n        ]}\n    ]},\n    {Tutorials = [\n        {\"A Free Alternative to AI for Robust Web Scraping\" = \"tutorials/replacing_ai.md\"},\n        {\"Migrating from BeautifulSoup\" = \"tutorials/migrating_from_beautifulsoup.md\"}\n    ]},\n    {Development = [\n        {\"API Reference\" = [\n            {Selector = \"api-reference/selector.md\"},\n            {Fetchers = \"api-reference/fetchers.md\"},\n            {\"MCP Server\" = \"api-reference/mcp-server.md\"},\n            {\"Custom Types\" = \"api-reference/custom-types.md\"},\n            {Response = \"api-reference/response.md\"},\n            {Spiders = \"api-reference/spiders.md\"},\n            {\"Proxy Rotation\" = \"api-reference/proxy-rotation.md\"}\n        ]},\n        {\"Writing your retrieval system\" = \"development/adaptive_storage_system.md\"},\n        {\"Using Scrapling's custom types\" = \"development/scrapling_custom_types.md\"}\n    ]},\n    {\"Support and Advertisement\" = \"donate.md\"},\n    {Contributing = \"https://github.com/D4Vinci/Scrapling/blob/main/CONTRIBUTING.md\"},\n    {Changelog = \"https://github.com/D4Vinci/Scrapling/releases\"}\n]\n\n[project.theme]\nlanguage = \"en\"\ncustom_dir = \"docs/overrides\"\nlogo = \"assets/logo.png\"\nfavicon = \"assets/favicon.ico\"\nfeatures = [\n    \"navigation.path\",\n#    \"announce.dismiss\",\n    \"navigation.top\",\n    \"navigation.footer\",\n    \"navigation.indexes\",\n    \"navigation.sections\",\n    \"navigation.tracking\",\n    \"navigation.instant\",\n    
\"navigation.instant.prefetch\",\n    \"navigation.instant.progress\",\n#    \"navigation.tabs\",\n#    \"navigation.expand\",\n#    \"toc.integrate\",\n    \"search.share\",\n    \"search.suggest\",\n    \"search.highlight\",\n]\n\n[[project.theme.palette]]\nmedia = \"(prefers-color-scheme: light)\"\nscheme = \"default\"\naccent = \"green\"\nprimary = \"deep purple\"\ntoggle.icon = \"lucide/sun\"\ntoggle.name = \"Switch to dark mode\"\n\n[[project.theme.palette]]\nmedia = \"(prefers-color-scheme: dark)\"\nscheme = \"slate\"\naccent = \"light green\"\nprimary = \"deep purple\"\ntoggle.icon = \"lucide/moon\"\ntoggle.name = \"Switch to light mode\"\n\n# Uncomment if needed:\n# [project.theme.font]\n# text = \"Open Sans\"\n# code = \"JetBrains Mono\"\n\n[project.markdown_extensions.pymdownx.caret]\n[project.markdown_extensions.pymdownx.mark]\n[project.markdown_extensions.pymdownx.tilde]\n[project.markdown_extensions.admonition]\n[project.markdown_extensions.abbr]\n#[project.markdown_extensions.mkautodoc]\n[project.markdown_extensions.pymdownx.details]\n[project.markdown_extensions.pymdownx.superfences]\ncustom_fences = [\n    {name = \"mermaid\", class = \"mermaid\", format = \"pymdownx.superfences.fence_code_format\"}\n]\n[project.markdown_extensions.pymdownx.inlinehilite]\n[project.markdown_extensions.pymdownx.snippets]\n[project.markdown_extensions.tables]\n\n[project.markdown_extensions.pymdownx.emoji]\nemoji_index = \"zensical.extensions.emoji.twemoji\"\nemoji_generator = \"zensical.extensions.emoji.to_svg\"\n\n[project.markdown_extensions.pymdownx.highlight]\npygments_lang_class = true\nanchor_linenums = true\nline_spans = \"__span\"\n\n[project.markdown_extensions.pymdownx.tabbed]\nalternate_style = true\n\n[project.markdown_extensions.codehilite]\ncss_class = \"highlight\"\n\n[project.markdown_extensions.toc]\ntitle = \"On this page\"\npermalink = true\ntoc_depth = 3\n\n[project.plugins.mkdocstrings.handlers.python]\ninventories = [\"https://docs.python.org/3/objects.inv\"]\npaths = [\"scrapling\"]\n\n[project.plugins.mkdocstrings.handlers.python.options]\ndocstring_style = \"sphinx\"\nshow_source = true\nshow_root_heading = true\nshow_if_no_docstring = true\ninherited_members = true\nmembers_order = \"source\"\nseparate_signature = true\nunwrap_annotated = true\nfilters = \"public\"\nmerge_init_into_class = true\ndocstring_section_style = \"spacy\"\nsignature_crossrefs = true\nshow_symbol_type_heading = true\nshow_symbol_type_toc = true\nshow_inheritance_diagram = true\nmodernize_annotations = true\nextensions = [\n    \"griffe_runtime_objects\",\n    \"griffe_sphinx\",\n    {griffe_inherited_docstrings = {merge = true}}\n]\n\n[[project.extra.social]]\nicon = \"fontawesome/brands/github\"\nlink = \"https://github.com/D4Vinci/Scrapling\"\n\n[[project.extra.social]]\nicon = \"fontawesome/brands/x-twitter\"\nlink = \"https://x.com/Scrapling_dev\"\n\n[[project.extra.social]]\nicon = \"fontawesome/brands/discord\"\nlink = \"https://discord.gg/EMgGbDceNQ\"\n\n[[project.extra.social]]\nicon = \"fontawesome/brands/python\"\nlink = \"https://pypi.org/project/scrapling/\"\n\n[[project.extra.social]]\nicon = \"fontawesome/brands/docker\"\nlink = \"https://hub.docker.com/r/pyd4vinci/scrapling\"\n\n[project.extra.analytics]\nprovider = \"google\"\nproperty = \"G-CS3DKLY73Z\"\n\n[project.extra.analytics.feedback]\ntitle = \"Was this page helpful?\"\n\n[[project.extra.analytics.feedback.ratings]]\nicon = \"material/heart\"\nname = \"This page was helpful\"\ndata = 1\nnote = \"Thanks for your 
feedback!\"\n\n[[project.extra.analytics.feedback.ratings]]\nicon = \"material/heart-broken\"\nname = \"This page could be improved\"\ndata = 0\nnote = \"\"\"\nThanks for your feedback! Help us improve this page by\n<a href=\"https://github.com/D4Vinci/Scrapling/issues/new?template=04-docs_issue.yml\" target=\"_blank\" rel=\"noopener\">opening a documentation issue</a>.\n\"\"\"\n\n[project.extra.consent]\ntitle = \"Cookie consent\"\ndescription = \"\"\"\nWe use cookies to recognize your repeated visits and preferences, as well\nas to measure the effectiveness of our documentation and whether users\nfind what they're searching for. With your consent, you're helping us to\nmake our documentation better.\n\"\"\"\nactions = [\n    \"accept\",\n    \"reject\",\n    \"manage\"\n]"
  }
]