Repository: janreges/siteone-crawler Branch: main Commit: 63203298f00e Files: 132 Total size: 2.0 MB Directory structure: gitextract_j_gpbi69/ ├── .githooks/ │ └── pre-commit ├── .github/ │ └── workflows/ │ ├── ci.yml │ ├── publish.yml │ └── release.yml ├── .gitignore ├── CHANGELOG.md ├── CLAUDE.md ├── Cargo.toml ├── LICENSE ├── README.md ├── docs/ │ ├── JSON-OUTPUT.md │ ├── OUTPUT-crawler.siteone.io.json │ ├── OUTPUT-crawler.siteone.io.txt │ └── TEXT-OUTPUT.md ├── rustfmt.toml ├── src/ │ ├── analysis/ │ │ ├── accessibility_analyzer.rs │ │ ├── analyzer.rs │ │ ├── base_analyzer.rs │ │ ├── best_practice_analyzer.rs │ │ ├── caching_analyzer.rs │ │ ├── content_type_analyzer.rs │ │ ├── dns_analyzer.rs │ │ ├── external_links_analyzer.rs │ │ ├── fastest_analyzer.rs │ │ ├── headers_analyzer.rs │ │ ├── manager.rs │ │ ├── mod.rs │ │ ├── page404_analyzer.rs │ │ ├── redirects_analyzer.rs │ │ ├── result/ │ │ │ ├── analyzer_stats.rs │ │ │ ├── dns_analysis_result.rs │ │ │ ├── header_stats.rs │ │ │ ├── heading_tree_item.rs │ │ │ ├── mod.rs │ │ │ ├── security_checked_header.rs │ │ │ ├── security_result.rs │ │ │ ├── seo_opengraph_result.rs │ │ │ └── url_analysis_result.rs │ │ ├── security_analyzer.rs │ │ ├── seo_opengraph_analyzer.rs │ │ ├── skipped_urls_analyzer.rs │ │ ├── slowest_analyzer.rs │ │ ├── source_domains_analyzer.rs │ │ └── ssl_tls_analyzer.rs │ ├── components/ │ │ ├── mod.rs │ │ ├── summary/ │ │ │ ├── item.rs │ │ │ ├── item_status.rs │ │ │ ├── mod.rs │ │ │ └── summary.rs │ │ ├── super_table.rs │ │ └── super_table_column.rs │ ├── content_processor/ │ │ ├── astro_processor.rs │ │ ├── base_processor.rs │ │ ├── content_processor.rs │ │ ├── css_processor.rs │ │ ├── html_processor.rs │ │ ├── javascript_processor.rs │ │ ├── manager.rs │ │ ├── mod.rs │ │ ├── nextjs_processor.rs │ │ ├── svelte_processor.rs │ │ └── xml_processor.rs │ ├── debugger.rs │ ├── engine/ │ │ ├── crawler.rs │ │ ├── found_url.rs │ │ ├── found_urls.rs │ │ ├── http_client.rs │ │ ├── http_response.rs │ │ ├── 
initiator.rs │ │ ├── manager.rs │ │ ├── mod.rs │ │ ├── parsed_url.rs │ │ └── robots_txt.rs │ ├── error.rs │ ├── export/ │ │ ├── base_exporter.rs │ │ ├── exporter.rs │ │ ├── file_exporter.rs │ │ ├── html_report/ │ │ │ ├── badge.rs │ │ │ ├── mod.rs │ │ │ ├── report.rs │ │ │ ├── tab.rs │ │ │ └── template.html │ │ ├── mailer_exporter.rs │ │ ├── markdown_exporter.rs │ │ ├── mod.rs │ │ ├── offline_website_exporter.rs │ │ ├── sitemap_exporter.rs │ │ ├── upload_exporter.rs │ │ └── utils/ │ │ ├── html_to_markdown.rs │ │ ├── markdown_site_aggregator.rs │ │ ├── mod.rs │ │ ├── offline_url_converter.rs │ │ └── target_domain_relation.rs │ ├── extra_column.rs │ ├── info.rs │ ├── lib.rs │ ├── main.rs │ ├── options/ │ │ ├── core_options.rs │ │ ├── group.rs │ │ ├── mod.rs │ │ ├── option.rs │ │ ├── option_type.rs │ │ └── options.rs │ ├── output/ │ │ ├── json_output.rs │ │ ├── mod.rs │ │ ├── multi_output.rs │ │ ├── output.rs │ │ ├── output_type.rs │ │ └── text_output.rs │ ├── result/ │ │ ├── basic_stats.rs │ │ ├── manager_stats.rs │ │ ├── mod.rs │ │ ├── status.rs │ │ ├── storage/ │ │ │ ├── file_storage.rs │ │ │ ├── memory_storage.rs │ │ │ ├── mod.rs │ │ │ ├── storage.rs │ │ │ └── storage_type.rs │ │ └── visited_url.rs │ ├── scoring/ │ │ ├── ci_gate.rs │ │ ├── mod.rs │ │ ├── quality_score.rs │ │ └── scorer.rs │ ├── server.rs │ ├── types.rs │ ├── utils.rs │ ├── version.rs │ └── wizard/ │ ├── form.rs │ ├── mod.rs │ └── presets.rs └── tests/ ├── common/ │ └── mod.rs └── integration_crawl.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .githooks/pre-commit ================================================ #!/bin/bash # Pre-commit hook: run cargo fmt, clippy, and tests before committing. 
set -e echo "=== Pre-commit: cargo fmt --check ===" cargo fmt -- --check echo "=== Pre-commit: cargo clippy ===" cargo clippy -- -D warnings echo "=== Pre-commit: cargo test ===" cargo test echo "=== Pre-commit checks passed ===" ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: push: branches: [main] pull_request: branches: [main] env: CARGO_TERM_COLOR: always jobs: check: name: Check & Lint runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable with: components: rustfmt, clippy - uses: actions/cache@v5 with: path: | ~/.cargo/registry ~/.cargo/git target key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} restore-keys: | ${{ runner.os }}-cargo- - name: Check formatting run: cargo fmt -- --check - name: Clippy run: cargo clippy -- -D warnings - name: Build run: cargo build - name: Run tests run: cargo test ================================================ FILE: .github/workflows/publish.yml ================================================ name: Publish to package managers # Triggers when a draft release is published (manually via GitHub UI) on: release: types: [published] workflow_dispatch: inputs: tag: description: 'Release tag (e.g. 
v2.0.1)' required: true permissions: contents: write jobs: # ───────────────────────────────────────────────────────────────── # Publish to crates.io # ───────────────────────────────────────────────────────────────── publish-crates: name: Publish to crates.io runs-on: ubuntu-latest if: vars.PUBLISH_CRATES == 'true' steps: - name: Checkout uses: actions/checkout@v6 with: ref: ${{ github.event.release.tag_name || inputs.tag }} - name: Determine version id: version run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Ensure Cargo.toml has correct version env: VERSION: ${{ steps.version.outputs.version }} run: sed -i "s/^version = .*/version = \"${VERSION}\"/" Cargo.toml - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable - name: Publish env: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} run: cargo publish --no-verify --allow-dirty || echo "Already published (skipping)" # ───────────────────────────────────────────────────────────────── # Update Homebrew tap # ───────────────────────────────────────────────────────────────── publish-homebrew: name: Update Homebrew formula runs-on: ubuntu-latest if: vars.PUBLISH_HOMEBREW == 'true' steps: - name: Determine version id: version run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Download release archives and compute SHA256 env: VERSION: ${{ steps.version.outputs.version }} run: | BASE_URL="https://github.com/${{ github.repository }}/releases/download/v${VERSION}" for SUFFIX in linux-x64 linux-arm64 macos-x64 macos-arm64; do FILE="siteone-crawler-v${VERSION}-${SUFFIX}.tar.gz" curl -sfL "${BASE_URL}/${FILE}" -o "${FILE}" SHA=$(sha256sum "${FILE}" | cut -d' ' -f1) VAR_NAME="SHA_$(echo "${SUFFIX}" | tr '[:lower:]-' '[:upper:]_')" echo "${VAR_NAME}=${SHA}" >> "$GITHUB_ENV" echo "${VAR_NAME}=${SHA}" done - name: Clone Homebrew tap env: TAP_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN 
}} run: | git clone "https://x-access-token:${TAP_TOKEN}@github.com/janreges/homebrew-tap.git" tap - name: Update formula env: VERSION: ${{ steps.version.outputs.version }} run: | cat > tap/Formula/siteone-crawler.rb <<'FORMULA' class SiteoneCrawler < Formula desc "Website crawler and QA toolkit in Rust for security, performance, SEO, and accessibility audits, offline cloning, markdown export, sitemap generation, cache warming, and CI/CD gating — one dependency-free binary for all major platforms, 10 tools in one." homepage "https://crawler.siteone.io/" version "VERSION_PLACEHOLDER" license "MIT" on_macos do if Hardware::CPU.arm? url "https://github.com/janreges/siteone-crawler/releases/download/v#{version}/siteone-crawler-v#{version}-macos-arm64.tar.gz" sha256 "SHA_MACOS_ARM64_PLACEHOLDER" else url "https://github.com/janreges/siteone-crawler/releases/download/v#{version}/siteone-crawler-v#{version}-macos-x64.tar.gz" sha256 "SHA_MACOS_X64_PLACEHOLDER" end end on_linux do if Hardware::CPU.arm? 
url "https://github.com/janreges/siteone-crawler/releases/download/v#{version}/siteone-crawler-v#{version}-linux-arm64.tar.gz" sha256 "SHA_LINUX_ARM64_PLACEHOLDER" else url "https://github.com/janreges/siteone-crawler/releases/download/v#{version}/siteone-crawler-v#{version}-linux-x64.tar.gz" sha256 "SHA_LINUX_X64_PLACEHOLDER" end end def install bin.install "siteone-crawler" end test do assert_match "SiteOne Crawler", shell_output("#{bin}/siteone-crawler --version") end end FORMULA sed -i "s/VERSION_PLACEHOLDER/${VERSION}/g" tap/Formula/siteone-crawler.rb sed -i "s/SHA_MACOS_ARM64_PLACEHOLDER/${SHA_MACOS_ARM64}/g" tap/Formula/siteone-crawler.rb sed -i "s/SHA_MACOS_X64_PLACEHOLDER/${SHA_MACOS_X64}/g" tap/Formula/siteone-crawler.rb sed -i "s/SHA_LINUX_ARM64_PLACEHOLDER/${SHA_LINUX_ARM64}/g" tap/Formula/siteone-crawler.rb sed -i "s/SHA_LINUX_X64_PLACEHOLDER/${SHA_LINUX_X64}/g" tap/Formula/siteone-crawler.rb - name: Push updated formula run: | cd tap git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git add Formula/siteone-crawler.rb git diff --cached --quiet && echo "Formula already up to date" && exit 0 git commit -m "chore: update siteone-crawler to v${{ steps.version.outputs.version }}" git push # ───────────────────────────────────────────────────────────────── # Update Scoop bucket # ───────────────────────────────────────────────────────────────── publish-scoop: name: Update Scoop manifest runs-on: ubuntu-latest if: vars.PUBLISH_SCOOP == 'true' steps: - name: Determine version id: version run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Download Windows archives and compute SHA256 env: VERSION: ${{ steps.version.outputs.version }} run: | BASE_URL="https://github.com/${{ github.repository }}/releases/download/v${VERSION}" for SUFFIX in win-x64 win-arm64; do FILE="siteone-crawler-v${VERSION}-${SUFFIX}.zip" curl -sfL 
"${BASE_URL}/${FILE}" -o "${FILE}" SHA=$(sha256sum "${FILE}" | cut -d' ' -f1) VAR_NAME="SHA_$(echo "${SUFFIX}" | tr '[:lower:]-' '[:upper:]_')" echo "${VAR_NAME}=${SHA}" >> "$GITHUB_ENV" done - name: Clone Scoop bucket env: BUCKET_TOKEN: ${{ secrets.SCOOP_BUCKET_TOKEN }} run: | git clone "https://x-access-token:${BUCKET_TOKEN}@github.com/janreges/scoop-siteone.git" bucket - name: Update manifest env: VERSION: ${{ steps.version.outputs.version }} run: | mkdir -p bucket/bucket cat > bucket/bucket/siteone-crawler.json << 'TEMPLATE' { "version": "VERSION_PLACEHOLDER", "description": "Website crawler and QA toolkit in Rust for security, performance, SEO, and accessibility audits, offline cloning, markdown export, sitemap generation, cache warming, and CI/CD gating — one dependency-free binary for all major platforms, 10 tools in one.", "homepage": "https://crawler.siteone.io/", "license": "MIT", "architecture": { "64bit": { "url": "https://github.com/janreges/siteone-crawler/releases/download/vVERSION_PLACEHOLDER/siteone-crawler-vVERSION_PLACEHOLDER-win-x64.zip", "hash": "HASH_X64_PLACEHOLDER" }, "arm64": { "url": "https://github.com/janreges/siteone-crawler/releases/download/vVERSION_PLACEHOLDER/siteone-crawler-vVERSION_PLACEHOLDER-win-arm64.zip", "hash": "HASH_ARM64_PLACEHOLDER" } }, "extract_dir": "siteone-crawler", "bin": "siteone-crawler.exe", "checkver": "github", "autoupdate": { "architecture": { "64bit": { "url": "https://github.com/janreges/siteone-crawler/releases/download/v$version/siteone-crawler-v$version-win-x64.zip" }, "arm64": { "url": "https://github.com/janreges/siteone-crawler/releases/download/v$version/siteone-crawler-v$version-win-arm64.zip" } } } } TEMPLATE sed -i "s/VERSION_PLACEHOLDER/${VERSION}/g" bucket/bucket/siteone-crawler.json sed -i "s/HASH_X64_PLACEHOLDER/${SHA_WIN_X64}/g" bucket/bucket/siteone-crawler.json sed -i "s/HASH_ARM64_PLACEHOLDER/${SHA_WIN_ARM64}/g" bucket/bucket/siteone-crawler.json - name: Push updated manifest run: | cd 
bucket git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git add bucket/siteone-crawler.json git commit -m "chore: update siteone-crawler to v${{ steps.version.outputs.version }}" git push # ───────────────────────────────────────────────────────────────── # Submit to WinGet # ───────────────────────────────────────────────────────────────── publish-winget: name: Submit to WinGet runs-on: windows-latest # Requires initial manual submission to microsoft/winget-pkgs first. # Once JanReges.SiteOneCrawler exists in winget-pkgs, set PUBLISH_WINGET=true. if: vars.PUBLISH_WINGET == 'true' steps: - name: Determine version id: version shell: bash run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Install wingetcreate run: winget install Microsoft.WingetCreate --accept-source-agreements --accept-package-agreements - name: Update WinGet manifest env: VERSION: ${{ steps.version.outputs.version }} WINGET_TOKEN: ${{ secrets.WINGET_TOKEN }} run: | $url_x64 = "https://github.com/janreges/siteone-crawler/releases/download/v$env:VERSION/siteone-crawler-v$env:VERSION-win-x64.zip" $url_arm64 = "https://github.com/janreges/siteone-crawler/releases/download/v$env:VERSION/siteone-crawler-v$env:VERSION-win-arm64.zip" wingetcreate update JanReges.SiteOneCrawler ` --version $env:VERSION ` --urls $url_x64 $url_arm64 ` --token $env:WINGET_TOKEN ` --submit # ───────────────────────────────────────────────────────────────── # Update AUR package # ───────────────────────────────────────────────────────────────── publish-aur: name: Update AUR package runs-on: ubuntu-latest if: vars.PUBLISH_AUR == 'true' steps: - name: Determine version id: version run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Compute SHA256 for Linux archives env: VERSION: ${{ steps.version.outputs.version }} run: | 
BASE_URL="https://github.com/${{ github.repository }}/releases/download/v${VERSION}" for SUFFIX in linux-x64 linux-arm64; do FILE="siteone-crawler-v${VERSION}-${SUFFIX}.tar.gz" curl -sfL "${BASE_URL}/${FILE}" -o "${FILE}" SHA=$(sha256sum "${FILE}" | cut -d' ' -f1) VAR_NAME="SHA_$(echo "${SUFFIX}" | tr '[:lower:]-' '[:upper:]_')" echo "${VAR_NAME}=${SHA}" >> "$GITHUB_ENV" done - name: Setup SSH for AUR env: AUR_SSH_KEY: ${{ secrets.AUR_SSH_KEY }} run: | mkdir -p ~/.ssh echo "$AUR_SSH_KEY" > ~/.ssh/aur chmod 600 ~/.ssh/aur echo "Host aur.archlinux.org" >> ~/.ssh/config echo " IdentityFile ~/.ssh/aur" >> ~/.ssh/config echo " User aur" >> ~/.ssh/config ssh-keyscan aur.archlinux.org >> ~/.ssh/known_hosts - name: Clone AUR repo and update PKGBUILD env: VERSION: ${{ steps.version.outputs.version }} run: | git clone ssh://aur@aur.archlinux.org/siteone-crawler-bin.git aur cd aur cat > PKGBUILD << PKGBUILD # Maintainer: Jan Reges pkgname=siteone-crawler-bin pkgver=${VERSION} pkgrel=1 pkgdesc="Website crawler and QA toolkit in Rust for security, performance, SEO, and accessibility audits, offline cloning, markdown export, sitemap generation, cache warming, and CI/CD gating — one dependency-free binary for all major platforms, 10 tools in one." 
arch=('x86_64' 'aarch64') url="https://crawler.siteone.io/" license=('MIT') provides=('siteone-crawler') conflicts=('siteone-crawler') source_x86_64=("https://github.com/janreges/siteone-crawler/releases/download/v\${pkgver}/siteone-crawler-v\${pkgver}-linux-x64.tar.gz") source_aarch64=("https://github.com/janreges/siteone-crawler/releases/download/v\${pkgver}/siteone-crawler-v\${pkgver}-linux-arm64.tar.gz") sha256sums_x86_64=('${SHA_LINUX_X64}') sha256sums_aarch64=('${SHA_LINUX_ARM64}') package() { install -Dm755 "\${srcdir}/siteone-crawler/siteone-crawler" "\${pkgdir}/usr/bin/siteone-crawler" install -Dm644 "\${srcdir}/siteone-crawler/LICENSE" "\${pkgdir}/usr/share/licenses/\${pkgname}/LICENSE" } PKGBUILD cat > .SRCINFO << SRCINFO pkgbase = siteone-crawler-bin pkgdesc = Website crawler and QA toolkit in Rust for security, performance, SEO, and accessibility audits, offline cloning, markdown export, sitemap generation, cache warming, and CI/CD gating — one dependency-free binary for all major platforms, 10 tools in one. 
pkgver = ${VERSION} pkgrel = 1 url = https://crawler.siteone.io/ arch = x86_64 arch = aarch64 license = MIT provides = siteone-crawler conflicts = siteone-crawler source_x86_64 = https://github.com/janreges/siteone-crawler/releases/download/v${VERSION}/siteone-crawler-v${VERSION}-linux-x64.tar.gz sha256sums_x86_64 = ${SHA_LINUX_X64} source_aarch64 = https://github.com/janreges/siteone-crawler/releases/download/v${VERSION}/siteone-crawler-v${VERSION}-linux-arm64.tar.gz sha256sums_aarch64 = ${SHA_LINUX_ARM64} pkgname = siteone-crawler-bin SRCINFO git config user.name "Jan Reges" git config user.email "jan.reges@siteone.cz" git add PKGBUILD .SRCINFO git commit -m "chore: update siteone-crawler to v${VERSION}" git push # ───────────────────────────────────────────────────────────────── # Publish .deb and .rpm to Cloudsmith (APT + DNF repository) # ───────────────────────────────────────────────────────────────── publish-cloudsmith: name: Publish to Cloudsmith runs-on: ubuntu-latest if: vars.PUBLISH_CLOUDSMITH == 'true' steps: - name: Determine version id: version run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Download .deb, .rpm and .apk from release env: GH_TOKEN: ${{ github.token }} VERSION: ${{ steps.version.outputs.version }} run: | mkdir -p packages BASE_URL="https://github.com/${{ github.repository }}/releases/download/v${VERSION}" # Download all .deb, .rpm and .apk assets from the release for file in $(gh release view "v${VERSION}" --repo "${{ github.repository }}" --json assets -q '.assets[].name' | grep -E '\.(deb|rpm|apk)$'); do echo "Downloading ${file} ..." 
curl -sfL "${BASE_URL}/${file}" -o "packages/${file}" done - name: List packages run: ls -lhR packages/ - name: Install Cloudsmith CLI run: pip install cloudsmith-cli - name: Upload .deb packages env: CLOUDSMITH_API_KEY: ${{ secrets.CLOUDSMITH_API_KEY }} run: | for deb in packages/*.deb; do [ -f "$deb" ] || continue echo "Uploading $deb ..." cloudsmith push deb janreges/siteone-crawler/any-distro/any-version "$deb" --republish done - name: Upload .rpm packages env: CLOUDSMITH_API_KEY: ${{ secrets.CLOUDSMITH_API_KEY }} run: | for rpm in packages/*.rpm; do [ -f "$rpm" ] || continue echo "Uploading $rpm ..." cloudsmith push rpm janreges/siteone-crawler/any-distro/any-version "$rpm" --republish done - name: Upload .apk packages env: CLOUDSMITH_API_KEY: ${{ secrets.CLOUDSMITH_API_KEY }} run: | for apk in packages/*.apk; do [ -f "$apk" ] || continue echo "Uploading $apk ..." cloudsmith push alpine janreges/siteone-crawler/alpine/any-version "$apk" --republish done ================================================ FILE: .github/workflows/release.yml ================================================ name: Release # Trigger: push a tag like v1.0.10 on: push: tags: - 'v*' # Manual trigger for building artifacts only (no release created) workflow_dispatch: inputs: version: description: 'Version number (e.g. 
1.0.10)' required: true permissions: contents: write env: CARGO_TERM_COLOR: always jobs: build: name: Build ${{ matrix.artifact_suffix }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: include: - target: x86_64-unknown-linux-gnu os: ubuntu-latest artifact_suffix: linux-x64 archive: tar.gz - target: aarch64-unknown-linux-gnu os: ubuntu-latest artifact_suffix: linux-arm64 archive: tar.gz cross: true - target: x86_64-apple-darwin os: macos-latest artifact_suffix: macos-x64 archive: tar.gz - target: aarch64-apple-darwin os: macos-latest artifact_suffix: macos-arm64 archive: tar.gz - target: x86_64-pc-windows-msvc os: windows-latest artifact_suffix: win-x64 archive: zip - target: aarch64-pc-windows-msvc os: windows-latest artifact_suffix: win-arm64 archive: zip - target: x86_64-unknown-linux-musl os: ubuntu-latest artifact_suffix: linux-musl-x64 archive: tar.gz musl: true - target: aarch64-unknown-linux-musl os: ubuntu-latest artifact_suffix: linux-musl-arm64 archive: tar.gz cross: true musl: true steps: - name: Checkout uses: actions/checkout@v6 - name: Determine version id: version shell: bash run: | if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then VERSION="${{ github.event.inputs.version }}" else # Extract from tag: v1.0.10 -> 1.0.10 VERSION="${GITHUB_REF_NAME#v}" fi echo "version=${VERSION}" >> "$GITHUB_OUTPUT" echo "Version: ${VERSION}" - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable with: targets: ${{ matrix.target }} - name: Install cross (for cross-compilation) if: matrix.cross run: cargo install cross --git https://github.com/cross-rs/cross - name: Install musl tools if: matrix.musl && !matrix.cross run: sudo apt-get install -y musl-tools - name: Update version in source shell: bash run: | VERSION="${{ steps.version.outputs.version }}" DATE_SUFFIX="$(date +%Y%m%d)" VERSION_CODE="${VERSION}.${DATE_SUFFIX}" # Update Cargo.toml sed -i.bak "s/^version = .*/version = \"${VERSION}\"/" Cargo.toml # Update version.rs sed 
-i.bak "s/^pub const CODE: .*/pub const CODE: \&str = \"${VERSION_CODE}\";/" src/version.rs echo "Cargo.toml version: ${VERSION}" echo "version.rs CODE: ${VERSION_CODE}" - name: Build shell: bash run: | if [[ "${{ matrix.cross }}" == "true" ]]; then cross build --release --target ${{ matrix.target }} else cargo build --release --target ${{ matrix.target }} fi # ── macOS Code Signing & Notarization ────────────────────────── - name: Import Apple certificate if: runner.os == 'macOS' env: CERTIFICATE_BASE64: ${{ secrets.APPLE_CERTIFICATE_BASE64 }} CERTIFICATE_PASSWORD: ${{ secrets.APPLE_CERTIFICATE_PASSWORD }} run: | CERTIFICATE_PATH="$RUNNER_TEMP/certificate.p12" KEYCHAIN_PATH="$RUNNER_TEMP/signing.keychain-db" KEYCHAIN_PASSWORD="$(openssl rand -hex 16)" echo -n "$CERTIFICATE_BASE64" | base64 --decode -o "$CERTIFICATE_PATH" security create-keychain -p "$KEYCHAIN_PASSWORD" "$KEYCHAIN_PATH" security set-keychain-settings -lut 21600 "$KEYCHAIN_PATH" security unlock-keychain -p "$KEYCHAIN_PASSWORD" "$KEYCHAIN_PATH" security import "$CERTIFICATE_PATH" \ -P "$CERTIFICATE_PASSWORD" \ -A -t cert -f pkcs12 \ -k "$KEYCHAIN_PATH" security set-key-partition-list \ -S apple-tool:,apple: \ -k "$KEYCHAIN_PASSWORD" \ "$KEYCHAIN_PATH" security list-keychain -d user -s "$KEYCHAIN_PATH" - name: Sign macOS binary if: runner.os == 'macOS' env: SIGNING_IDENTITY: ${{ secrets.APPLE_SIGNING_IDENTITY }} run: | BINARY="target/${{ matrix.target }}/release/siteone-crawler" codesign --force --options runtime \ --sign "$SIGNING_IDENTITY" \ "$BINARY" echo "Verifying signature..." 
codesign --verify --verbose "$BINARY" echo "Signature OK" - name: Notarize macOS binary if: runner.os == 'macOS' env: APPLE_ID: ${{ secrets.APPLE_ID }} APPLE_ID_PASSWORD: ${{ secrets.APPLE_ID_PASSWORD }} APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }} run: | BINARY="target/${{ matrix.target }}/release/siteone-crawler" NOTARIZE_ZIP="$RUNNER_TEMP/notarize.zip" # ditto is required — Apple's notary service rejects zip-created archives ditto -c -k --keepParent "$BINARY" "$NOTARIZE_ZIP" echo "Submitting for notarization..." xcrun notarytool submit "$NOTARIZE_ZIP" \ --apple-id "$APPLE_ID" \ --password "$APPLE_ID_PASSWORD" \ --team-id "$APPLE_TEAM_ID" \ --wait echo "Notarization complete" - name: Clean up keychain if: runner.os == 'macOS' && always() run: | KEYCHAIN_PATH="$RUNNER_TEMP/signing.keychain-db" if [ -f "$KEYCHAIN_PATH" ]; then security delete-keychain "$KEYCHAIN_PATH" fi # ──────────────────────────────────────────────────────────────── - name: Package (Unix) if: matrix.archive == 'tar.gz' shell: bash run: | VERSION="${{ steps.version.outputs.version }}" ARTIFACT="siteone-crawler-v${VERSION}-${{ matrix.artifact_suffix }}" mkdir -p "staging/siteone-crawler" cp "target/${{ matrix.target }}/release/siteone-crawler" "staging/siteone-crawler/" cp README.md "staging/siteone-crawler/" 2>/dev/null || true cp LICENSE "staging/siteone-crawler/" 2>/dev/null || true chmod +x "staging/siteone-crawler/siteone-crawler" (cd staging && tar czf "../${ARTIFACT}.tar.gz" siteone-crawler/) echo "ARTIFACT_PATH=${ARTIFACT}.tar.gz" >> "$GITHUB_ENV" - name: Package (Windows) if: matrix.archive == 'zip' shell: bash run: | VERSION="${{ steps.version.outputs.version }}" ARTIFACT="siteone-crawler-v${VERSION}-${{ matrix.artifact_suffix }}" mkdir -p "staging/siteone-crawler" cp "target/${{ matrix.target }}/release/siteone-crawler.exe" "staging/siteone-crawler/" cp README.md "staging/siteone-crawler/" 2>/dev/null || true cp LICENSE "staging/siteone-crawler/" 2>/dev/null || true (cd staging && 7z a 
-r "../${ARTIFACT}.zip" siteone-crawler/) echo "ARTIFACT_PATH=${ARTIFACT}.zip" >> "$GITHUB_ENV" # ── Build .deb and .rpm packages (Linux only) ────────────── - name: Install cross-compilation tools (arm64) if: runner.os == 'Linux' && matrix.cross run: sudo apt-get install -y binutils-aarch64-linux-gnu - name: Strip binary (Linux) if: runner.os == 'Linux' shell: bash run: | BINARY="target/${{ matrix.target }}/release/siteone-crawler" if [[ "${{ matrix.target }}" == "aarch64"* ]]; then aarch64-linux-gnu-strip -s "$BINARY" || true else strip -s "$BINARY" || true fi - name: Build .deb package if: runner.os == 'Linux' shell: bash run: | cargo install cargo-deb if [[ "${{ matrix.musl }}" == "true" ]]; then cargo deb --no-build --no-strip --target ${{ matrix.target }} --variant static else cargo deb --no-build --no-strip --target ${{ matrix.target }} fi echo "DEB_PATH=$(ls target/${{ matrix.target }}/debian/*.deb)" >> "$GITHUB_ENV" - name: Build .rpm package if: runner.os == 'Linux' shell: bash run: | cargo install cargo-generate-rpm mkdir -p target/release cp "target/${{ matrix.target }}/release/siteone-crawler" target/release/ if [[ "${{ matrix.musl }}" == "true" ]]; then # Override package name for static/musl variant sed -i 's/^name = "siteone-crawler"$/name = "siteone-crawler-static"/' Cargo.toml fi cargo generate-rpm --target ${{ matrix.target }} echo "RPM_PATH=$(find target -name '*.rpm' -path '*/generate-rpm/*' | head -1)" >> "$GITHUB_ENV" - name: Upload .deb artifact if: runner.os == 'Linux' uses: actions/upload-artifact@v7 with: name: siteone-crawler-${{ matrix.artifact_suffix }}-deb path: ${{ env.DEB_PATH }} - name: Upload .rpm artifact if: runner.os == 'Linux' uses: actions/upload-artifact@v7 with: name: siteone-crawler-${{ matrix.artifact_suffix }}-rpm path: ${{ env.RPM_PATH }} # ──────────────────────────────────────────────────────────────── - name: Upload artifact uses: actions/upload-artifact@v7 with: name: siteone-crawler-${{ matrix.artifact_suffix }} 
path: ${{ env.ARTIFACT_PATH }} # ───────────────────────────────────────────────────────────────── # Build Alpine .apk packages from musl binaries # ───────────────────────────────────────────────────────────────── package-alpine: name: Build Alpine .apk (${{ matrix.arch }}) needs: build runs-on: ubuntu-latest strategy: fail-fast: false matrix: include: - arch: x86_64 artifact_suffix: linux-musl-x64 - arch: aarch64 artifact_suffix: linux-musl-arm64 steps: - name: Checkout uses: actions/checkout@v6 - name: Determine version id: version shell: bash run: | if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then VERSION="${{ github.event.inputs.version }}" else VERSION="${GITHUB_REF_NAME#v}" fi echo "version=${VERSION}" >> "$GITHUB_OUTPUT" - name: Download musl binary uses: actions/download-artifact@v8 with: name: siteone-crawler-${{ matrix.artifact_suffix }} path: dist - name: Extract binary run: | VERSION="${{ steps.version.outputs.version }}" tar xzf "dist/siteone-crawler-v${VERSION}-${{ matrix.artifact_suffix }}.tar.gz" -C dist - name: Setup Alpine uses: jirutka/setup-alpine@v1 with: arch: ${{ matrix.arch }} packages: abuild - name: Prepare signing key shell: alpine.sh --root {0} env: ALPINE_RSA_KEY: ${{ secrets.ALPINE_RSA_PRIVATE_KEY }} ALPINE_RSA_PUB: ${{ secrets.ALPINE_RSA_PUBLIC_KEY }} run: | BUILDER=runner # Install signing key mkdir -p /etc/apk/keys printf '%s\n' "$ALPINE_RSA_PUB" > /etc/apk/keys/siteone.rsa.pub # Setup abuild config for builder mkdir -p "/home/$BUILDER/.abuild" printf '%s\n' "$ALPINE_RSA_KEY" > "/home/$BUILDER/.abuild/siteone.rsa" printf '%s\n' "$ALPINE_RSA_PUB" > "/home/$BUILDER/.abuild/siteone.rsa.pub" chmod 600 "/home/$BUILDER/.abuild/siteone.rsa" cat > "/home/$BUILDER/.abuild/abuild.conf" << 'EOF' PACKAGER_PRIVKEY="$HOME/.abuild/siteone.rsa" EOF chown -R "$BUILDER" "/home/$BUILDER/.abuild" # Add user to abuild group addgroup "$BUILDER" abuild - name: Build .apk shell: alpine.sh {0} env: VERSION: ${{ steps.version.outputs.version 
}} run: | ARCH=$(uname -m) # Prepare build directory mkdir -p ~/build cp "$GITHUB_WORKSPACE/dist/siteone-crawler/siteone-crawler" ~/build/ cp "$GITHUB_WORKSPACE/LICENSE" ~/build/ 2>/dev/null || true # Create APKBUILD cat > ~/build/APKBUILD << EOF # Maintainer: Jan Reges pkgname=siteone-crawler pkgver=${VERSION} pkgrel=1 pkgdesc="Website crawler and QA toolkit in Rust" url="https://crawler.siteone.io/" arch="${ARCH}" license="MIT" source="" options="!check !strip" package() { install -Dm755 "\$startdir/siteone-crawler" "\$pkgdir/usr/bin/siteone-crawler" install -Dm644 "\$startdir/LICENSE" "\$pkgdir/usr/share/licenses/\$pkgname/LICENSE" 2>/dev/null || true } EOF # Build the package cd ~/build abuild -d -P ~/packages # Copy and rename to include arch (both arches produce the same filename) mkdir -p "$GITHUB_WORKSPACE/apk-out" for f in $(find ~/packages -name '*.apk'); do BASENAME=$(basename "$f" .apk) cp "$f" "$GITHUB_WORKSPACE/apk-out/${BASENAME}-${ARCH}.apk" done - name: Upload .apk artifact uses: actions/upload-artifact@v7 with: name: siteone-crawler-alpine-${{ matrix.arch }} path: apk-out/*.apk release: name: Create GitHub Release needs: [build, package-alpine] runs-on: ubuntu-latest if: always() && startsWith(github.ref, 'refs/tags/v') && needs.build.result == 'success' steps: - name: Checkout uses: actions/checkout@v6 - name: Download all artifacts uses: actions/download-artifact@v8 with: path: artifacts merge-multiple: true - name: Determine version id: version run: echo "version=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT" - name: List artifacts run: ls -lhR artifacts/ - name: Create Release uses: softprops/action-gh-release@v2 with: name: "v${{ steps.version.outputs.version }}" body: | ### Downloads | Platform | Architecture | File | |----------|-------------|------| | Linux | x64 | `siteone-crawler-v${{ steps.version.outputs.version }}-linux-x64.tar.gz` | | Linux | arm64 | `siteone-crawler-v${{ steps.version.outputs.version }}-linux-arm64.tar.gz` | | Linux | 
x64 (musl/static) | `siteone-crawler-v${{ steps.version.outputs.version }}-linux-musl-x64.tar.gz` | | Linux | arm64 (musl/static) | `siteone-crawler-v${{ steps.version.outputs.version }}-linux-musl-arm64.tar.gz` | | macOS | arm64 (Apple Silicon) | `siteone-crawler-v${{ steps.version.outputs.version }}-macos-arm64.tar.gz` | | macOS | x64 (Intel) | `siteone-crawler-v${{ steps.version.outputs.version }}-macos-x64.tar.gz` | | Windows | x64 | `siteone-crawler-v${{ steps.version.outputs.version }}-win-x64.zip` | | Windows | arm64 | `siteone-crawler-v${{ steps.version.outputs.version }}-win-arm64.zip` | ### Linux packages (glibc — best performance, requires glibc 2.39+) | Format | Architecture | File | |--------|-------------|------| | Debian/Ubuntu (.deb) | x64 | `siteone-crawler_${{ steps.version.outputs.version }}-1_amd64.deb` | | Debian/Ubuntu (.deb) | arm64 | `siteone-crawler_${{ steps.version.outputs.version }}-1_arm64.deb` | | Fedora/RHEL (.rpm) | x64 | `siteone-crawler-${{ steps.version.outputs.version }}-1.x86_64.rpm` | | Fedora/RHEL (.rpm) | arm64 | `siteone-crawler-${{ steps.version.outputs.version }}-1.aarch64.rpm` | ### Linux packages (musl/static — any Linux, ~50–80% slower) | Format | Architecture | File | |--------|-------------|------| | Debian/Ubuntu (.deb) | x64 | `siteone-crawler-static_${{ steps.version.outputs.version }}-1_amd64.deb` | | Debian/Ubuntu (.deb) | arm64 | `siteone-crawler-static_${{ steps.version.outputs.version }}-1_arm64.deb` | | Fedora/RHEL (.rpm) | x64 | `siteone-crawler-static-${{ steps.version.outputs.version }}-1.x86_64.rpm` | | Fedora/RHEL (.rpm) | arm64 | `siteone-crawler-static-${{ steps.version.outputs.version }}-1.aarch64.rpm` | | Alpine (.apk) | x64 | `siteone-crawler-${{ steps.version.outputs.version }}-r1-x86_64.apk` | | Alpine (.apk) | arm64 | `siteone-crawler-${{ steps.version.outputs.version }}-r1-aarch64.apk` | ### Quick start ```bash # Extract and run tar xzf siteone-crawler-v${{ steps.version.outputs.version 
}}-linux-x64.tar.gz cd siteone-crawler ./siteone-crawler --url=https://example.com ``` ### Install via package manager ```bash # Debian/Ubuntu (glibc — Ubuntu 24.04+, Debian 13+) sudo dpkg -i siteone-crawler_${{ steps.version.outputs.version }}-1_amd64.deb # Debian/Ubuntu (static/musl — older distributions) sudo dpkg -i siteone-crawler-static_${{ steps.version.outputs.version }}-1_amd64.deb # Fedora/RHEL sudo dnf install ./siteone-crawler-${{ steps.version.outputs.version }}-1.x86_64.rpm ``` files: artifacts/* generate_release_notes: true draft: true prerelease: false ================================================ FILE: .gitignore ================================================ /target /tmp/ /dist/ *.swp *.swo *~ .idea/ .vscode/ *.cache ================================================ FILE: CHANGELOG.md ================================================ ### Changelog All notable changes to this project will be documented in this file. Dates are displayed in UTC. #### [v1.0.9](https://github.com/janreges/siteone-crawler/compare/v1.0.8...v1.0.9) - typos: non exhaustive typo and spelling corrections [`#8`](https://github.com/janreges/siteone-crawler/pull/8) - offline exporter: new option --ignore-store-file-error for the OfflineWebsiteExporter [`#16`](https://github.com/janreges/siteone-crawler/pull/16) - url handling: added option --transform-url to force requests for some URL to be internally transformed and a different URL/domain (e.g. 
local) to be queried, fixes #58 [`#58`](https://github.com/janreges/siteone-crawler/issues/58) - html report: added option to list which sections to include in the HTML report via --html-report-options (see README.md), fixes #63 [`#63`](https://github.com/janreges/siteone-crawler/issues/63) - offline export: fix behavior regarding URLs containing various valid UTF-8 characters (German, Chinese, etc.), fixes #65 [`#65`](https://github.com/janreges/siteone-crawler/issues/65) - seo analysis: fix for an issue that occurs when encoding UTF-8 due to some special characters in the content, fixes #51 [`#51`](https://github.com/janreges/siteone-crawler/issues/51) - offline website exporter: added option --offline-export-no-auto-redirect-html, which disables the generation of automatic sub-folder.html with meta redirects to sub-folder/index.html, fixes #54 [`#54`](https://github.com/janreges/siteone-crawler/issues/54) - offline website exporter: fix replacing reference where it is followed by and not an immediate number, fixes #52 [`#52`](https://github.com/janreges/siteone-crawler/issues/52) - slowest analyzer: fixed typo slowest->slower, fixes #42 [`#42`](https://github.com/janreges/siteone-crawler/issues/42) - url & sitemaps: as --url it is now possible to specify a URL to sitemap xml, or sitemap index xml, from which to find a list of all URLs, fixes #25 [`#25`](https://github.com/janreges/siteone-crawler/issues/25) - github: remove all unnecessary files from the release package [`e54029c`](https://github.com/janreges/siteone-crawler/commit/e54029cbef015a259d92e93933e81af2e851a145) - github: fix release workflow [`c9d5361`](https://github.com/janreges/siteone-crawler/commit/c9d5361acd646b24e47cb6e60e7d07be12cd96c9) - github: workflow for automatic creation of release archives for all 5 supported platforms/architectures [`0a461ac`](https://github.com/janreges/siteone-crawler/commit/0a461aca0b145982005b6a460d3e852a0767426a) - webp analysis: if there are avif images on the 
website (they are more optimized than webp), we will not report the absence of webp [`e067653`](https://github.com/janreges/siteone-crawler/commit/e06765332fa743f9bb22f5eb589cb71a01dc90db) - term: if TERM is not set or we're not in a TTY, use default width 138 [`eb839e4`](https://github.com/janreges/siteone-crawler/commit/eb839e423abf4df7020822986dc9e2ae43d44971) - options: handling of the situation of calling only 'crawler' without a parameters - complete documentation and a red message about the need to pass at least the --url parameter will be displayed [`fc390ae`](https://github.com/janreges/siteone-crawler/commit/fc390ae693ba201b060effc67a90ad893772558f) - phpstan: fix errors found by phpstan and increasing the memory limit for phpstan [`650d46a`](https://github.com/janreges/siteone-crawler/commit/650d46abb867ff04df24be01c3c6daebd42b0911) - tests: fix the tests after removing the underscore for the external domain [`b31a872`](https://github.com/janreges/siteone-crawler/commit/b31a872fdc439a321c364665d18634906ce8ad30) - Revert "url parser: fix url parsing in some cases when href starts with './'" [`240430b`](https://github.com/janreges/siteone-crawler/commit/240430bc90039063b9e810980360f798afa46f74) - url parser: fix url parsing in some cases when href starts with './' [`2443532`](https://github.com/janreges/siteone-crawler/commit/244353202c80c152b6a3b63ef83f6046338404e9) - url parser: fix url parsing in some cases when href starts with './' [`fe33e7b`](https://github.com/janreges/siteone-crawler/commit/fe33e7b6c404a62cf636db20b6193196c4bf6e25) - website to markdown: added --markdown-remove-links-and-images-from-single-file - useful when used within an AI tool to obtain context from a website (typically with documentation of a solution/framework) [`631e544`](https://github.com/janreges/siteone-crawler/commit/631e544b9eb836a01f68f055e80b8b35b16687dc) - website to markdown: fixed the problem with incorrect sorting of the root index.md (homepage should be at the 
beginning) [`c2ffff3`](https://github.com/janreges/siteone-crawler/commit/c2ffff32a48e84c872e132417ef1623015755e7e) - website to markdown: fine tuning of the resulting markdown files, correct detection of table headers, removal of excess whitespaces [`ee40b29`](https://github.com/janreges/siteone-crawler/commit/ee40b2915611824c676c5d4761a266edba6be0d2) - website to markdown: added --markdown-export-single-file for the ability to save all website content into one combined markdown file (smart detection and removal of shared headers and footers is also implemented) [`af01376`](https://github.com/janreges/siteone-crawler/commit/af013766830991f473d26dc25dc5804cc88b7c76) - readme: changed partnership to powered by JetBrains [`e77f755`](https://github.com/janreges/siteone-crawler/commit/e77f755527319e99d66cec6b2b2864dee4d560e4) - readme: added partnership with JetBrains [`0104646`](https://github.com/janreges/siteone-crawler/commit/0104646b6f209eae1c530bb68160a5fa238f7dda) - website to markdown: added implicit excluded selectors for typical 'hidden' classes [`b3c57d6`](https://github.com/janreges/siteone-crawler/commit/b3c57d69f9ef52592cab308b493b328b81c29705) - website to markdown: consecutive links fixes (ignore links without visible text or defined href) [`6d9a310`](https://github.com/janreges/siteone-crawler/commit/6d9a31053a56bb532961395a2e1821f2028e36ac) - website to markdown: list fixes and prepared auto-removal of duplicates (e.g. 
desktop & mobile version of menus) [`338b0c6`](https://github.com/janreges/siteone-crawler/commit/338b0c692a434a4f3a2a20160c9f45004526c04a) - website to markdown: removed unwanted escaping from links/images [`35c6f57`](https://github.com/janreges/siteone-crawler/commit/35c6f579a62080316210ec00734af8069ea32f27) - website to markdown: refactoring the way ul/ol lists are composed (there were problems with nested lists and whitespaces) [`15ea68c`](https://github.com/janreges/siteone-crawler/commit/15ea68ce7e3e3d962c5813ad971571eec42fe933) - README: improved introduction and added icons [`737b8c6`](https://github.com/janreges/siteone-crawler/commit/737b8c63bce618bdd613090205866d03bde1d67b) - docs: added Table of Contents to JSON-OUTPUT.md and TEXT-OUTPUT.md [`2aa2856`](https://github.com/janreges/siteone-crawler/commit/2aa28569de9fa0315219e68d084cc675ead57303) - docs: added detailed documentation and real sample JSON and TXT output from the crawler for a better idea of its functionality [`09495d1`](https://github.com/janreges/siteone-crawler/commit/09495d187e3f80d4f4c29176c12540e300c5cb6f) - docs: added detailed documentation and real sample JSON and TXT output from the crawler for a better idea of its functionality [`cb7606b`](https://github.com/janreges/siteone-crawler/commit/cb7606b2c8fe1547cfdf787d0b7050693228ff2e) - json output docs: first version [`73e8d45`](https://github.com/janreges/siteone-crawler/commit/73e8d45ad93e1cc88a778533d0955e22cec9d6c7) - output options: added option --timezone (e.g. 
Europe/Prague, default is UTC) to set the time zone in which dates and times in HTML reports and exported folder/file names should be, refs #57 [`e3d3213`](https://github.com/janreges/siteone-crawler/commit/e3d321315b6c9f0290b5795345a90e78af32a358) - website to markdown: use link URL as text when link text is empty [`873ffae`](https://github.com/janreges/siteone-crawler/commit/873ffae76a8d96c8a2a4e4670ad09f4ed8527d4a) - website to markdown: if the link contains nested div/span tags, display the link in markdown as a list-item so that it is on its own line [`c48f346`](https://github.com/janreges/siteone-crawler/commit/c48f34614a057b79e5f9e5d5fbb9877cd7c2d25f) - website to markdown: removed the use of html2markdown (problematic integration on windows due to cygwin) and replaced with a custom HtmlToMarkdownConverter [`4e1db09`](https://github.com/janreges/siteone-crawler/commit/4e1db090f7b9276663c8fda587e8673d67783340) - content processor: added justification for skipping URLs due to exceeding --max-depth [`a6bc08a`](https://github.com/janreges/siteone-crawler/commit/a6bc08ac2367b3fb008b51e1278b3b78ae5bfe28) - README: converting arguments to a table view and adding missing links to the outline [`c23a686`](https://github.com/janreges/siteone-crawler/commit/c23a6860f06918062a159747039bd38e868cd7f8) - README: added all missing options (--max-reqs-per-sec, --max-heading-level, --websocket-server, --console-width and a few others less important) [`82c48bc`](https://github.com/janreges/siteone-crawler/commit/82c48bccf597a7c3811c16ef6c8b29fc37d7c46c) - extra columns: added option to extract data using XPath and RegEx to --extra-columns [`cd6d55a`](https://github.com/janreges/siteone-crawler/commit/cd6d55af254f4f38b25399293aa6d122c578f4c7) - http response: ensuring that the repeated response header is merged into a concatenated string, instead of an array, refs #48 [`c0f3b21`](https://github.com/janreges/siteone-crawler/commit/c0f3b210e3ddca203eb9363f038bcf4e30a3f30c) - css 
processor: fix for a situation where some processors could cause CSS content to be NULL [`c8f2ffc`](https://github.com/janreges/siteone-crawler/commit/c8f2ffc45628a2f0f1e477dc7e2ea436c9ebafbe) - website to markdown: better removal of nested images in situations like [![logo by @foobar](data:image/gif;base64,fooo= "logo by @foobar")](index.html) [`9ecba5e`](https://github.com/janreges/siteone-crawler/commit/9ecba5e91608fbbcd625e3ff42621869a7e31f00) - website to markdown: first version of the converter of entire web pages to markdown [`b944edb`](https://github.com/janreges/siteone-crawler/commit/b944edbcc33381c97ba220e1920994574c676225) - security check: handle case of multiple headers with the same name [`706977e`](https://github.com/janreges/siteone-crawler/commit/706977e545c428ab82714e95a75294841dac5e46) - html processor: do not remove the schema and host for URLs defined in --ignore-regex [`8be42af`](https://github.com/janreges/siteone-crawler/commit/8be42afea5af076aee097842fa3c4996e66c47ef) - offline export: added --offline-export-remove-unwanted-code=<1/0> (default is 1) to remove unwanted code for offline mode - typically, JS of the analytics, social networks, cookie consent, cross origins, etc .. refs #37 [`17a11fa`](https://github.com/janreges/siteone-crawler/commit/17a11fa3fe7a2d9c012e0f70c2392e833e02193c) - loop protection: added --max-non200-responses-per-basename as configurable protection against looping with dynamic non-200 URLs. 
If a basename (the last part of the URL after the last slash) has more non-200 responses than this limit, other URLs with same basename will be ignored/skipped [`063bddf`](https://github.com/janreges/siteone-crawler/commit/063bddf47a9fe82dc2b08297acd16fd154001feb) - bin/swoole-cli: upgrade to latest Swoole 6.0.0 (this version already supports Swoole\Threads - in the future there will be a refactoring that will relieve us of the necessity to use Swoole\Table, which requires memory preallocation for a predefined number of rows + my ticket https://github.com/swoole/swoole-src/issues/5460 has been processed regarding the support of getting the values of repeated header) [`b6e7c23`](https://github.com/janreges/siteone-crawler/commit/b6e7c23c055032a1003605ef2679f1ca59b64a08) - css processor: fix query string and anchor processing for paths in url() + don't replace url(data:*) with complex information e.g. about svg including brackets, refs #31 [`36eece8`](https://github.com/janreges/siteone-crawler/commit/36eece89c0719602145dbf51673d80355a80bfd2) - skipped urls: width defined fixed at 60 - better for most situations than the previous dynamic calculation [`8ef462f`](https://github.com/janreges/siteone-crawler/commit/8ef462f2fb52b65213136705536ba575dd2a9511) - manager: refactored mb_convert_encoding() -> htmlentities() as part of the migration to PHP 8.4.1 [`5c7c903`](https://github.com/janreges/siteone-crawler/commit/5c7c903d7d4d178b691c870e4a71fc862685c21d) - http cache analysis: added analysis of http cache of all pages and assets - divided by content type, domains, and their combination [`b09cfbd`](https://github.com/janreges/siteone-crawler/commit/b09cfbdf3fe033feef3b64b0fcbbda15dc0308ab) - css processing: added search for urls in @import url(*.css) [`c964fea`](https://github.com/janreges/siteone-crawler/commit/c964fea1382fec71b990ac2cd89683590694d5b3) - analysis/report: if there is no URL with code >= 200, there is no point to perform analysis, print empty output of 
all analyzers and generate full report [`c1bb448`](https://github.com/janreges/siteone-crawler/commit/c1bb448922cb47d0fe7fa28d2c5f540d6961ea94) - options: fix passing booleans to correctUrl() in case of empty '-u' or '--url' parameters (recognized as boolean flags) [`a297fec`](https://github.com/janreges/siteone-crawler/commit/a297feccb34002604705c180855d8f12cd0e41a2) - skipped-urls: added overview of skipped URLs including summary across domains - not only from security point of view it is good to know where external links are pointing and from where js/css/fonts/images are loaded [`84ae146`](https://github.com/janreges/siteone-crawler/commit/84ae1467a6c02b194e9e5631351f00a52b5924e0) - user-agent: if a manually defined user-agent ends with the exclamation !, do not add the signature siteone-crawler/version and remove the exclamation [`cfda3b0`](https://github.com/janreges/siteone-crawler/commit/cfda3b072e208966f9e7078211257d1a027d2bfa) - options: better response and warning for unfilled required --url [`52e50db`](https://github.com/janreges/siteone-crawler/commit/52e50db58f15bd10ab64b70f4c3f3fbf299c0135) - dns resolving: added --resolve attribute, which behaves exactly the same as curl, and using the 'domain:port:ip' entry it is possible to provide a custom IP address for the domain:port pair [`4031181`](https://github.com/janreges/siteone-crawler/commit/403118132807f30c65ed89b1b2d8f924a22e3a90) - windows/cygwin: workarounds for cygwin environment to return as much DNS/SSL/TLS info as possible even if nslookup or dig cannot be called [`bfc4f55`](https://github.com/janreges/siteone-crawler/commit/bfc4f5508e85b4af2e7c17309181791a3a9d5fc1) - upload timeout: fix that --upload-timeout does not overwrite the primary timeout [`c429639`](https://github.com/janreges/siteone-crawler/commit/c429639e30d44420bc9af017536714df52868813) - readme: adding a sample report and clone of nextjs.org and a few other updates 
[`07ad5e1`](https://github.com/janreges/siteone-crawler/commit/07ad5e119b47455ff4a2e3ba6230a21203d40396) - readme: added description for --allowed-domain-for-external-files and --allowed-domain-for-crawling [`0c8b1b3`](https://github.com/janreges/siteone-crawler/commit/0c8b1b3fb791a5c0e8f540a34d62122682680c19) - filtering: added --single-foreign-page to ensure that only the linked page and its assets are loaded from the external domain (which second-level domain is not the same as the initialization URL), but not all other pages on the external domain are automatically crawled [`c4af4ec`](https://github.com/janreges/siteone-crawler/commit/c4af4ec5fb76456f4d47eaf6041ba4be4fbb48b8) - filtering: added --disable-all-assets as a shortcut for calling all --disable-* flags [`7e32c44`](https://github.com/janreges/siteone-crawler/commit/7e32c440fb0ee0260f7b1e2c6b2a01b753ffb149) - filtering: added --max-depth=<int> for maximum crawling depth (for pages, not assets) and --single-page moved to basic options [`2dbff75`](https://github.com/janreges/siteone-crawler/commit/2dbff756dec3735f8f8c9f293dcd846eb3b3fde6) - resource filtering: added --single-page for loading only one given URL and their assets [`7325a4b`](https://github.com/janreges/siteone-crawler/commit/7325a4bbf633f60015309e257af509a5f21384d5) - offline exporter: added the possibility to use --replace-query-string to replace the default behavior where the query string is replaced by a short hash constructed from the query string in filenames, see issue #30 [`1a3482c`](https://github.com/janreges/siteone-crawler/commit/1a3482c6dada06b8482f205ceb181d8b42a62607) - offline export: added --replace-content=<val> option to replace content in HTML/JS/CSS before saving to disk (with strict text & regexp support) [`81cddaa`](https://github.com/janreges/siteone-crawler/commit/81cddaaf57550ac253b3e1ab322c3f5498374e96) - revert caps 
[`76a7418`](https://github.com/janreges/siteone-crawler/commit/76a74184c871714871f537344a84e757069fff0c) - Revert "Auxiliary commit to revert individual files from b3bb0eea10075aee124cce485379c24ece78df79" [`5878be9`](https://github.com/janreges/siteone-crawler/commit/5878be97f663d8ac70eac9e56578e628faeabb9f) - robots.txt handling: process Disallow records only for user-agent 'SiteOne-Crawler' or '*' [`9c2c989`](https://github.com/janreges/siteone-crawler/commit/9c2c989c569fed518bb5139c1d496159cc486683) - new option for the OfflineWebsiteExporter [`2c4bbbc`](https://github.com/janreges/siteone-crawler/commit/2c4bbbc6f0e55a4f3af6a89be50450e15b65cdd2) - tables: added --rows-limit option (default 200) to hard limit the length of all tables with data from analyses (except Visited URLs) to prevent very long and slow reports .. tables are sorted by severity, so it should be ok [`9798252`](https://github.com/janreges/siteone-crawler/commit/9798252901dd25797d1d38fa26a19c6dbc409fa1) - video gallery: added display of all found videos with video player (including use of observer for lazy loading and smart option to preload first seconds of video + button to play 2 seconds of each video sequentially) [`411736a`](https://github.com/janreges/siteone-crawler/commit/411736ac3852d07464fe4a4a52c4c0bf171d716f) - license: change of licensing to MIT [`14b73e2`](https://github.com/janreges/siteone-crawler/commit/14b73e2e10cc924112966d2c5b16812dadf1fc48) - non exhaustive typo and spelling corrections [`b3bb0ee`](https://github.com/janreges/siteone-crawler/commit/b3bb0eea10075aee124cce485379c24ece78df79) #### [v1.0.8](https://github.com/janreges/siteone-crawler/compare/v1.0.7...v1.0.8) > 24 August 2024 - reports: changed file name composition from report.mydomain.com.* to mydomain.com.report.* [`#9`](https://github.com/janreges/siteone-crawler/pull/9) - version: update to 1.0.8.20240824 
[`6c634e0`](https://github.com/janreges/siteone-crawler/commit/6c634e0f88cce49aa3f5fb9cd69ca55fa5191bd8) - version 1.0.8.20240824 + changelog [`a02cc7b`](https://github.com/janreges/siteone-crawler/commit/a02cc7bf4c0fc4703189341d9ea0be2345b95796) - crawler: solved edge-case, which very rarely occurred when the queue processing was already finished, but the last outstanding coroutine still found some new URL [`a85990d`](https://github.com/janreges/siteone-crawler/commit/a85990d662d74af281805cfdf10c0320fee0007a) - javascript processor: improvement of webpack JS processing in order to correctly replace paths from VueJS during offline export (as e.g. in case of docs.netlify.com) .. without this, HTML had the correct paths in the left menu, but JS immediately broke them because they started with an absolute path with a slash at the beginning [`9bea99b`](https://github.com/janreges/siteone-crawler/commit/9bea99b9684e6059b8abfad4b382fafdad31c9a9) - offline export: detect and process fonts.googleapis.com/css* as CSS even if there is no .css extension [`da33100`](https://github.com/janreges/siteone-crawler/commit/da33100975635be8305e07c2023a22c300b66216) - js processor: removed the forgotten var_dump [`5f2c36d`](https://github.com/janreges/siteone-crawler/commit/5f2c36de1666e6987d2c9d88a39e3b6d0a2e1f32) - offline export: improved search for external JS in the case of webpack (dynamic composition of URLs from an object with the definition of chunks) - it was debugged on docs.netlify.com [`a61e72e`](https://github.com/janreges/siteone-crawler/commit/a61e72e7f5b773a437b4151432db04a5afd7124a) - offline export: in case the URL ends with a dot and a number (so it looks like an extension), we must not recognize it as an extension in some cases [`c382d95`](https://github.com/janreges/siteone-crawler/commit/c382d959f7440ebfcd95566ec0050e771a2f3495) - offline url converter: better support for SVG in case the URL does not contain an extension at all, but has e.g. 
'icon' in the URL (it's not perfect) [`c9c01a6`](https://github.com/janreges/siteone-crawler/commit/c9c01a69905fefce82f4e8f85e707a0d1abb5e1e) - offline exporter: warning instead of exception for some edge-cases, e.g. not saving SVG without an extension does not cause the export to stop [`9d285f4`](https://github.com/janreges/siteone-crawler/commit/9d285f4d599ba8892dd8752e8d831cd3c86af178) - cors: do not set Origin request header for images (otherwise error 403 on cdn.sanity.io for svg, etc.) [`2f3b7eb`](https://github.com/janreges/siteone-crawler/commit/2f3b7eb51a03d42d3d2961c84aadcd118b546e05) - best practice analyzer: in checking for missing quotes ignore values longer than 1000 characters (fixes, e.g., at skoda-auto.cz the error Compilation failed: regular expression is too large at offset 90936) [`8a009df`](https://github.com/janreges/siteone-crawler/commit/8a009df9734773275fd9805862dc9bfeeccb6079) - html report: added loading of extra headers to the visited URL list in the HTML report [`781cf17`](https://github.com/janreges/siteone-crawler/commit/781cf17c18088126db74ebc1ef00fee3d6784979) - Frontload the report names [`62d2aae`](https://github.com/janreges/siteone-crawler/commit/62d2aae57e31c7bfa53720446cc8dfbc59e482af) - robots.txt: added option --ignore-robots-txt (we often need to view internal or preview domains that are otherwise prohibited from indexing by search engines) [`9017c45`](https://github.com/janreges/siteone-crawler/commit/9017c45a675dd327895b57f14095ad6bd52a02fc) - http client: added an explicit 'Connection: close' header and explicitly calling $client->close(), even though Swoole was doing it automatically after exiting the coroutine [`86a7346`](https://github.com/janreges/siteone-crawler/commit/86a7346d059452d210b945ca4329e1cc17781dca) - javascript processor: parse url addresses to import the JS module only in JS files (otherwise imports from HTML documentation, e.g. 
on the websites svelte.dev or nextjs.org, were parsed by mistake) [`592b618`](https://github.com/janreges/siteone-crawler/commit/592b618c01e75509e16a812fafab7f21f3c7c64d) - html processor: added obtaining urls from HTML attributes that are not wrapped in quotes (but I am aware that current regexps can cause problems in the cases when are used spaces, which are not properly escaped) [`f00abab`](https://github.com/janreges/siteone-crawler/commit/f00ababfa459eca27dce7657fe91c70831f86089) - offline url converter: swapping woff2/woff order for regex because in this case their priority is important and because of that woff2 didn't work properly [`3f318d1`](https://github.com/janreges/siteone-crawler/commit/3f318d19fa0a3757546493ac7f47cca21922b1f5) - non-200 url basename detection: we no longer consider e.g. image generators that have the same basename and the url to the image in the query parameters as the same basename [`bc15ef1`](https://github.com/janreges/siteone-crawler/commit/bc15ef198bb13fe845fef8cd4946b2cab5c2ea6d) - supertable: activation of automatic creation of active links also for homepage '/' [`c2e228e`](https://github.com/janreges/siteone-crawler/commit/c2e228e0d475351431cf9b060487e86ce6d33e52) - analysis and robots.txt: improving the display of url addresses for SEO analysis in the case of a multi-domain website, so that it cannot happen that the same url, e.g. 
'/', is in the overview multiple times without recognizing the domain or scheme + improving the work with robots.txt in SEO detection and displaying urls banned for indexing [`47c7602`](https://github.com/janreges/siteone-crawler/commit/47c7602217e40a4f6d4f3af5c71d6dff72952aab) - offline website exporter: we add the suffix '_' to the folder name only in the case of a typical extension of a static file - we don't want this to happen with domain names as well [`d16722a`](https://github.com/janreges/siteone-crawler/commit/d16722a5ad6271270fb0fff11e66a7f02f3b6e9a) - javascript processor: extract JS urls also from imports like import {xy} from "./path/foo.js" [`aec6cab`](https://github.com/janreges/siteone-crawler/commit/aec6cab051a46df9d89866f5cfd7e66312dafb92) - visited url: added 'txt' extension to looksLikeStaticFileByUrl() [`460c645`](https://github.com/janreges/siteone-crawler/commit/460c6453d91e85c2889ebaa2b2542fd88c5ffa6a) - html processor: extract JS urls also from <link href="*.js">, typically with rel="modulepreload" [`c4a92be`](https://github.com/janreges/siteone-crawler/commit/c4a92bee00d96c530431134370a3ba0d2216a1c1) - html processor: extracting repeated calls to getFullUrl() into a variable [`a5e1306`](https://github.com/janreges/siteone-crawler/commit/a5e1306530717d9edd4f95a7989539a172a38f4a) - analysis: do not include urls that failed to load (timeout, skipping, etc.) 
in the analysis of content-types and source-domains - prevention of displaying content type 'unknown' [`b21ecfb`](https://github.com/janreges/siteone-crawler/commit/b21ecfb85f58d07c0a82b93826ad2977ab2cd523) - cli options: improved method of removing quotes even for options that can be arrays - also fixes --extra-columns='Title' [`97f2761`](https://github.com/janreges/siteone-crawler/commit/97f27611acf2fc4ed24b1e5574be84711ea3fa12) - url skipping: if there are a lot of URLs with the same basename (ending after the last slash), we will allow a maximum of 5 requests for URLs with the same basename - the purpose is to prevent a lot of 404 from being triggered when there is an incorrect relative link to relative/my-img.jpg on all pages (e.g. on 404 page on v2.svelte.dev) [`4fbb917`](https://github.com/janreges/siteone-crawler/commit/4fbb91791f9111cc6f9d98b60732fcca7fad2f1f) - analysis: perform most of the analysis only on URLs from domains for which we have crawling enabled [`313adde`](https://github.com/janreges/siteone-crawler/commit/313addede29ac847273b6ab6ed3a8ab878a6fb4a) - audio & video: added audio/video file search in <audio> and <video> tags, if file crawling is not disabled [`d72a5a5`](https://github.com/janreges/siteone-crawler/commit/d72a5a51bd6863425a3d8bcffc7a9b5eb831f979) - base practices: retexting stupid warning like '<h2> after <h0>' to '<h2> without previous heading' [`041b383`](https://github.com/janreges/siteone-crawler/commit/041b3836a8a585158ae1a1a6fb0057b367f3a4f6) - initial url redirect: in the case that a URL is entered that redirects to another url/domain within the same 2nd-level domain (typically http->https or mydomain.tld -> www.mydomain.tld redirects), we continue crawling with new url/domain and declare a new url as initial url [`166e617`](https://github.com/janreges/siteone-crawler/commit/166e617fbc893798dc7b340f43de75df2d4cf335) #### [v1.0.7](https://github.com/janreges/siteone-crawler/compare/v1.0.6...v1.0.7) > 22 December 2023 - 
version 1.0.7.20231222 + changelog [`9d2be52`](https://github.com/janreges/siteone-crawler/commit/9d2be52776c081989322953c7a31debfd4947420) - html report template: updated logo link to crawler.siteone.io [`9892cfe`](https://github.com/janreges/siteone-crawler/commit/9892cfe5708a3da2f5fc355246dd50b2a0c5cb4f) - http headers analysis: renamed 'Headers' to 'HTTP headers' [`436e6ea`](https://github.com/janreges/siteone-crawler/commit/436e6ea5a9914c8615bb03b444ac0aad15e31c49) - sitemap generator: added info about crawler to generated sitemap.xml [`7cb7005`](https://github.com/janreges/siteone-crawler/commit/7cb7005bf50b8f93b421c94c57ff51eb99b45912) - html report: refactor of all inline on* event listeners to data attributes and event listeners added from static JS inside <script>, so that we can disable all inline JS in the online HTML report and allow only our JS signed with hashes by Content-Security-Policy [`b576eef`](https://github.com/janreges/siteone-crawler/commit/b576eef55a5678a67928970fc51aaaefd7abd1a8) - readme: removed HTTP auth from roadmap (it's already done), improved guide how to implement own upload endpoint and message about SMTP moved under mailer options [`e1567ae`](https://github.com/janreges/siteone-crawler/commit/e1567aee52f9d09c1cef1ad35babaf9eea388175) - utils: hide passwords/authentication specified in cli parameters as *auth=xyz (e.g. 
--http-auth=abc:xyz)" in html report [`c8bb88f`](https://github.com/janreges/siteone-crawler/commit/c8bb88fc1a65ecdfd53db23fc5d972b841830837) - readme: fixed formatting of the upload and expert options [`2d14bd5`](https://github.com/janreges/siteone-crawler/commit/2d14bd5972496989624f91617de2689601e1c027) - readme: added Upload Options [`d8352c5`](https://github.com/janreges/siteone-crawler/commit/d8352c5acfddbeef1c1ae6498556dc296d944e0b) - upload exporter: added possibility via --upload to upload HTML report to offline URL, by default crawler.siteone.io/html/* [`2a027c3`](https://github.com/janreges/siteone-crawler/commit/2a027c38bfdb8e6e416b9a79ebe81e809c9326d9) - parsed-url: fixed warning in the case of url without host [`284e844`](https://github.com/janreges/siteone-crawler/commit/284e844f3f94cdb02032ddb76e51caa9a584c120) - seo and opengraph: fixed false positives 'DENY (robots.txt)' in some cases [`658b649`](https://github.com/janreges/siteone-crawler/commit/658b6494130fa282505ec38f12aa058acf7709b9) - best practices and inline-svgs: detection and display of the entire icon set in the HTML report in the case of <svg> with more <symbol> or <g> [`3b2772c`](https://github.com/janreges/siteone-crawler/commit/3b2772c59f822b7b4a6f91e15b616815b5ff92c4) - sitemap generator: sort urls primary by number of dashes and secondary alphabetically (thanks to this, urls of the main levels will be at the beginning) [`bbc47e6`](https://github.com/janreges/siteone-crawler/commit/bbc47e6239f9693c621016a50e624698dc3d242d) - sitemap generator: only include URLs from the same domain as the initial URL [`9969254`](https://github.com/janreges/siteone-crawler/commit/9969254e35cd8c134f85a7817de8722091f0377c) - changelog: updated by 'composer changelog' [`0c67fd4`](https://github.com/janreges/siteone-crawler/commit/0c67fd4f8d308d8d51d5b912d9b82cc96fb6e4fb) - package.json: used by auto-changelog generator 
[`6ad8789`](https://github.com/janreges/siteone-crawler/commit/6ad87895e5a8ab8bbce3d9cbf92ee5e8b8218cc0) #### [v1.0.6](https://github.com/janreges/siteone-crawler/compare/v1.0.5...v1.0.6) > 8 December 2023 - readme: removed bold links from the intro (it didn't look as good on github as it did in the IDE) [`b675873`](https://github.com/janreges/siteone-crawler/commit/b6758733cde67f11322a2f82573b19ec1a0edc9d) - readme: improved intro and gif animation with the real output [`fd9e2d6`](https://github.com/janreges/siteone-crawler/commit/fd9e2d69c8f940cfaa81ad7bab86f1a74f01b0da) - http auth: for security reasons, we only send auth data to the same 2nd level domain (and possibly subdomains). With HTTP basic auth, the name and password are only base64 encoded and we would send them to foreign domains (which are referred to from the crawled website) [`4bc8a7f`](https://github.com/janreges/siteone-crawler/commit/4bc8a7f9871064aa1c88c374aa299904409d2817) - html report: increased specificity of the .header class for the header, because this class were also used by the generic class at <td class='header'> in security tab [`9d270e8`](https://github.com/janreges/siteone-crawler/commit/9d270e884545d6459f20348db71404e513ae8928) - html report: improved readability of badge colors in light mode [`76c5680`](https://github.com/janreges/siteone-crawler/commit/76c5680397446b84f3b13800590d914b7a9b0533) - crawler: moving the decrement of active workers after parsing URLs from the content, where further filling of the queue could occur (for this reason, queue processing could sometimes get stuck in the final stages) [`f8f82ab`](https://github.com/janreges/siteone-crawler/commit/f8f82ab61c1969952bb70f1b598ed3d97938a84e) - analysis: do not parse/check empty HTML (it produced unnecessary warning) - it is valid to have content-type: text/html but with connect-lengt: 0 (for example case for 'gtm.js?id=') 
[`436d81b`](https://github.com/janreges/siteone-crawler/commit/436d81b81f905178fb972f8b5cd0236bac244bc4) #### [v1.0.5](https://github.com/janreges/siteone-crawler/compare/v1.0.4...v1.0.5) > 3 December 2023 - changelog: updated changelog after 3 added commits to still untagged draft release 1.0.5 [`f42fe18`](https://github.com/janreges/siteone-crawler/commit/f42fe18de89676dc0dea4dc033207c934282d04b) - utils tests: fixed tests of methods getAbsolutePath() and getOutputFormattedPath() [`d4f4576`](https://github.com/janreges/siteone-crawler/commit/d4f4576ff566eb48495c9fb55a898b0989ef42c3) - crawler.php: replaced preg_match to str_contains [`5b28952`](https://github.com/janreges/siteone-crawler/commit/5b289521cdbb90b6571a29cb9c880e065b852129) - version: 1.0.5.20231204 + changelog [`7f2e974`](https://github.com/janreges/siteone-crawler/commit/7f2e9741fab25e9369151bc2d79a38b8827e2463) - option: replace placeholders like a '%domain' also in validateValue() method because there is also check if path is writable with attempt to mkdir [`329143f`](https://github.com/janreges/siteone-crawler/commit/329143fa23925ea523504735b3f724c026fe5ac6) - swoole in cygwin: improved getBaseDir() to work better even with the version of Swoole that does not have SCRIPT_DIR [`94cc5af`](https://github.com/janreges/siteone-crawler/commit/94cc5af4411a8c7427ee136a937ac629b8637668) - html processor: it must also process the page with the redirect, because is needed to replace the URL in the meta redirect tag [`9ce0eee`](https://github.com/janreges/siteone-crawler/commit/9ce0eeeebe1e524b9d46d91dd4cecb2e796db8c3) - sitemap: use formatted output path (primary for better output in Cygwin environment with needed C:/foo <-> /cygwin/c/foo conversion) [`6297a7f`](https://github.com/janreges/siteone-crawler/commit/6297a7f4069f9e09c013268e0df896db2fa91dec) - file exporter: use formatted output path (primary for better output in Cygwin environment with needed C:/foo <-> /cygwin/c/foo conversion) 
[`426cfb2`](https://github.com/janreges/siteone-crawler/commit/426cfb2b32f854d65abfce841e4e4f4badf04fef) - options: in the case of dir/file validation, we want to work with absolute paths for more precise error messages [`6df228b`](https://github.com/janreges/siteone-crawler/commit/6df228bdfc87a2c9fb6eee611fdc87d976b7f721) - crawler.php: improved baseDir detection - we want to work with absolute path in all scenarios [`9d1b2ce`](https://github.com/janreges/siteone-crawler/commit/9d1b2ce9bedb15ede90bcee9641e1cfc62b9c3cc) - utils: improved getAbsolutePath() for cygwin and added getOutputFormattedPath() with reverse logic for cygwin (C:/foo/bar <-> /cygdrive/c/foo/bar) [`161cfc5`](https://github.com/janreges/siteone-crawler/commit/161cfc5c4fd3fa3675cade409d7d5e11db2da0c6) - offline export: renamed --offline-export-directory to --offline-export-dir for consistency with --http-cache-dir or --result-storage-dir [`26ef45d`](https://github.com/janreges/siteone-crawler/commit/26ef45d145a1a02a5313067e6298571e26d9618b) #### [v1.0.4](https://github.com/janreges/siteone-crawler/compare/v1.0.3...v1.0.4) > 30 November 2023 - dom parsing: handling warnings in case of impossibility to parse some DOM elements correctly, fixes #3 [`#3`](https://github.com/janreges/siteone-crawler/issues/3) - version: 1.0.4.20231201 + changelog [`8e15781`](https://github.com/janreges/siteone-crawler/commit/8e15781265cdd9cce10d9dcde57d46b57b50e1cf) - options: ignore empty values in the case of directives with the possibility of repeated definition [`5e30c2f`](https://github.com/janreges/siteone-crawler/commit/5e30c2f8ad6cf00ad819ba1d7d6ec4e6c95a7113) - http-cache: now the http cache is turned off using the 'off' value (it's more understandable) [`9508409`](https://github.com/janreges/siteone-crawler/commit/9508409fbba2d96dc92cd73bed5abe462d5cea15) - core options: added --console-width to enforce the definition of the console width and disable automatic detection via 'tput cols' on macOS/Linux or 'mode 
con' on Windows (used by Electron GUI) [`8cf44b0`](https://github.com/janreges/siteone-crawler/commit/8cf44b06616e15301c486146a7c6b1003ce5137f) - gui support: added base-dir detection for Windows where the GUI crawler runs in Cygwin [`5ce893a`](https://github.com/janreges/siteone-crawler/commit/5ce893a66c7f1e21af025603b66223e04246e029) - renaming: renamed 'siteone-website-crawler' to 'siteone-crawler' and 'SiteOne Website Crawler' to 'SiteOne Crawler' [`64ddde4`](https://github.com/janreges/siteone-crawler/commit/64ddde4b53f16679a8c4671c98b3f9c619d94b42) - utils: fixed color-support detection [`62dbac0`](https://github.com/janreges/siteone-crawler/commit/62dbac07d15ecfa0ff677c277e2a3381a47025bf) - core options: added --force-color options to bypass tty detection (used by Electron GUI) [`607b4ad`](https://github.com/janreges/siteone-crawler/commit/607b4ad8583845adea209f75edfa27870ac23f9d) - best practice analysis: in the case of checking an image (e.g. for the existence of WebP/AVIF), we also want to check external images, because very often websites have images linked from external domains or services for image modification or optimization [`6100187`](https://github.com/janreges/siteone-crawler/commit/6100187347e0bbba6270335e2d9b2faf37475333) - html report: set scaleDown as default object-fit for image gallery [`91cd300`](https://github.com/janreges/siteone-crawler/commit/91cd300dcd7455c2b9be548fb2746cea7fd7c904) - offline exporter: added short -oed as alias to --offline-export-directory [`22368d9`](https://github.com/janreges/siteone-crawler/commit/22368d9a892aab8011aa4a0884bf01a8560f6167) - image gallery: list of all images on the website (except those from the srcset, where there would be duplicates only in other sizes or formats), including SVG with rich filtering options (through image format, size and source tag/attribute) and the option of choosing small/medium/view and scale-down/contains/cover for object-fit css property 
[`43de0af`](https://github.com/janreges/siteone-crawler/commit/43de0af1c60d398f91b373c192d1a35ac2df2fd1) - core options: added a shortened version of the command name consisting of only one hyphen and the first letters of the words of the full command (e.g. --memory-limit has short version -ml), added getInitialScheme() [`eb9a3cc`](https://github.com/janreges/siteone-crawler/commit/eb9a3cc62dffc58be2701c52bb21509d39a5dfad) - visited url: added 'sourceAttr' with information about where the given URL was found and useful helper methods [`6de4e39`](https://github.com/janreges/siteone-crawler/commit/6de4e39c5f8b9ba685e3865193274ccf0ee91a3d) - found urls: in the case of the occurrence of one URL in several places/attributes, we consider the first one to be the main one (typically the same URL in src and then also in srcset) [`660bb2b`](https://github.com/janreges/siteone-crawler/commit/660bb2b2bd2cb6949fe9c573e72b31e9fb97a9fe) - url parsing: added more recognition of which attributes the given URL address was parsed from (we need to recognize src and srcset for ImageGallery in particular) [`802c3c6`](https://github.com/janreges/siteone-crawler/commit/802c3c66a40087745e68f47392f0e6e8e9725171) - supertable and urls: in removing the redundant hostname for a more compact URL output, we also take into account the scheme http:// or https:// of initial URL (otherwise somewhere it looked like duplicate) + prevention of ansi-color definitions for bash in the HTML output [`915469e`](https://github.com/janreges/siteone-crawler/commit/915469e2a4a6d0fed337ca70efe9170758751ade) - title/description/keywords parsing: added html entities decoding because some website uses decoded entities with í – etc [`920523d`](https://github.com/janreges/siteone-crawler/commit/920523d3c55baf6cd7b2602334d9776b3e40f4d7) - crawler: added 'sourceAttr' to the swoole table queue and already visited URLs (we will use it in the Image Gallery for filtering, so as not to display unnecessarily and a lot of 
duplicate images only in other resolutions from the srcsets) [`0345abc`](https://github.com/janreges/siteone-crawler/commit/0345abc6dab770e3196dd88ff0123a2050828644) - url parameter: it is already possible not to enter the scheme and https:// or http:// will be added automatically (http:// for e.g. for localhost) [`85e14e9`](https://github.com/janreges/siteone-crawler/commit/85e14e961b53b83c208ac936972a335cace61bf8) - disabled images: in the case of a request to remove the images, replace their body with a 1x1px transparent gif and place a semi-transparent hatch with the crawler logo and opacity as a background [`c1418c3`](https://github.com/janreges/siteone-crawler/commit/c1418c3154301fd3995dde421b066f16850203e7) - url regex filtering: added option , which will allow you to limit the list of crawled pages according to the declared regexps, but at the same time it will allow you to crawl and download assets (js, css, images, fonts, documents, etc.) from any URL (but with respect to allowed domains) [`21e67e5`](https://github.com/janreges/siteone-crawler/commit/21e67e5be74050cd5b7c9998654ed66f18db4d85) - img srcset parsing: because a valid URL can also contain a comma (and various dynamic parametric img generators use them) and in the srcset a comma+whitespace should be used to separate multiple values, this is also reflected in the srcset parsing [`0db578b`](https://github.com/janreges/siteone-crawler/commit/0db578bda37c024b2b111c814e35c2107e4751ad) - websocket server: added option to set --websocket-server, which starts a parallel process with the websocket server, through which the crawler sends various information about the progress of crawling (this will also be used by Electron UI applications) [`649132f`](https://github.com/janreges/siteone-crawler/commit/649132f8965421cd1bb3570fbb9f534e6caef313) - http client: handle scenario when content loaded from cache is not valid (is_bool) 
[`1ddd099`](https://github.com/janreges/siteone-crawler/commit/1ddd099ecdadc5752016237ec1f0acf80e907dc8) - HTML report: updated logo with final look [`2a3bb42`](https://github.com/janreges/siteone-crawler/commit/2a3bb428180067a649f2467419920b3d4f70a9fd) - mailer: shortening and simplifying email content [`e797107`](https://github.com/janreges/siteone-crawler/commit/e7971071f8c5e4cff1472464ce9ec4407c198a59) - robots.txt: added info about loaded robots.txt to summary (limited to 10 domains for case of huge multi domain crawling) [`00f9365`](https://github.com/janreges/siteone-crawler/commit/00f93659637705bc6389c5f073a29f09b743370f) - redirects analyzer: handled edge case with empty url [`e9be1e3`](https://github.com/janreges/siteone-crawler/commit/e9be1e350b1d114c54b7099b54277da23467b538) - text output: added fancy banner with crawler logo (thanks to great SiteOne designers!) and smooth effect [`e011c35`](https://github.com/janreges/siteone-crawler/commit/e011c35f3cbc87fceb9d7a9c56c726817c79b543) - content processors: added applyContentChangesBeforeUrlParsing() and better NextJS chunks handling [`e5c404f`](https://github.com/janreges/siteone-crawler/commit/e5c404f2d52a7c2ebdb80ae3c93760c7e881dc9a) - url searches: added ignoring data:, mailto:, tel:, file:// and other non-requestable resources also to FoundUrls [`5349be2`](https://github.com/janreges/siteone-crawler/commit/5349be242f99567b8f5f093537a696ef5fd319ac) - crawler: added declare(strict_types=1) and banner [`27134d2`](https://github.com/janreges/siteone-crawler/commit/27134d29d16e3e24c633f010f731f11deeeadcb7) - heading structure analysis: highlighting and calculating errors for duplicate <h1> + added help cursor with a hint [`f5c7db6`](https://github.com/janreges/siteone-crawler/commit/f5c7db6206ed06e0cbaf38a7ae2505be573da2e6) - core options: added --help and --version, colorized help [`6f1ada1`](https://github.com/janreges/siteone-crawler/commit/6f1ada112898580d2de028c02e32fdeb8ad2a845) - ./crawler binary - 
send output of cd - to /dev/null and hide unwanted printed script path [`16fe79d`](https://github.com/janreges/siteone-crawler/commit/16fe79d08e24c4a6fbd87d16417413725aaa24e8) - README: updated paths in the documentation - it is now possible to use the ERROR: Option --url () must be valid URL [`86abd99`](https://github.com/janreges/siteone-crawler/commit/86abd998da94971c2512b6018085f39e8dd5db7f) - options: --workers default for Cygwin runtime is now 1 (instead of 3), because Cygwin runtime is highly unstable when workers > 1 [`f484960`](https://github.com/janreges/siteone-crawler/commit/f4849606fb382e1b759f547c4f1bfe2e5d8b4d02) #### [v1.0.3](https://github.com/janreges/siteone-crawler/compare/v1.0.2...v1.0.3) > 10 November 2023 - version: 1.0.3.20231110 + changelog [`5b80965`](https://github.com/janreges/siteone-crawler/commit/5b8096550dcd489a998d34fae44e3d99375e33e3) - cache/storage: better race-condition handling in a situation where several coroutines could write the same folder at one time, then mkdir reported 'File exists' [`be543dc`](https://github.com/janreges/siteone-crawler/commit/be543dc195e675e49064b20ee091903f1977942a) #### [v1.0.2](https://github.com/janreges/siteone-crawler/compare/v1.0.1...v1.0.2) > 10 November 2023 - version: 1.0.2.20231110 + changelog [`230b947`](https://github.com/janreges/siteone-crawler/commit/230b9478a36ee664dfe080447c09da9c4a9bc25c) - html report: added aria labels to active/important elements [`a329b9d`](https://github.com/janreges/siteone-crawler/commit/a329b9d4e0f040996c17cb3382cf3c07c61a4b35) - version: 1.0.1.20231109 - changelog [`50dc69c`](https://github.com/janreges/siteone-crawler/commit/50dc69c9ab956691bbf97860355d410a0bdba0c9) #### [v1.0.1](https://github.com/janreges/siteone-crawler/compare/v1.0.0...v1.0.1) > 9 November 2023 - version: 1.0.1.20231109 [`e213cb3`](https://github.com/janreges/siteone-crawler/commit/e213cb326db78e2f69fd3e4f04b9728223550a3d) - offline exporter: fixed case when on https:// website is link 
to same path but with http:// protocol (it overrode proper *.html file just with meta redirect .. real case from nextjs.org) [`4a1be0b`](https://github.com/janreges/siteone-crawler/commit/4a1be0bdfb62167c498f6c3b4c91fe74532ff833) - html processor: force to remove all anchor listeners when NextJS is detected (it is very hard to achieve a working NextJS with offline file:// protocol) [`2b1d935`](https://github.com/janreges/siteone-crawler/commit/2b1d935419bade80d8e6ab07b2ae04ded0df131e) - file exporters: now by default crawler generates a html/json/txt report to 'tmp/[report|output].%domain%.%datetime%.[html|json|txt]' .. i assume that most people will want to save/see them [`7831c6b`](https://github.com/janreges/siteone-crawler/commit/7831c6b87dd41444a0fca529bc450bf7934ef541) - security analysis: removed multi-line console output for recommendations .. it was ugly [`310af30`](https://github.com/janreges/siteone-crawler/commit/310af308859dbb2fd5895af468195e2339f2788d) - json output: added JSON_UNESCAPED_UNICODE for unescaped unicode chars (e.g. czech chars will be readable) [`cf1de9f`](https://github.com/janreges/siteone-crawler/commit/cf1de9f60820963ccb78a00b43ca3aec8b311a77) - mailer: do not send e-mails in case of interruption of the crawler using ctrl+c [`19c94aa`](https://github.com/janreges/siteone-crawler/commit/19c94aac8211b4550ba11497e1332d604f8cdbc7) - refactoring: manager stats logic extracted into ManagerStats and implemented also into manager of content processors + stats added into 'Crawler stats' tab in HTML report [`3754200`](https://github.com/janreges/siteone-crawler/commit/3754200652dc91ac05efe22812e64c0e4be84019) - refactoring: content related logic extracted to content processors based on ContentProcessor interface with methods findUrls():?FoundUrls, applyContentChangesForOfflineVersion():void and isContentTypeRelevant():bool + better division of web framework related logic (NextJS, Astro, Svelte, ...) 
+ better URL handling and maximized usage of ParsedUrl [`6d9f25c`](https://github.com/janreges/siteone-crawler/commit/6d9f25ce82f8a1cfbfbc6bc0b5a6a07262c427b1) - phpstan: ignore BASE_DIR warning [`6e0370a`](https://github.com/janreges/siteone-crawler/commit/6e0370aafe02d3bb2ca528ea8a9a37995f5ddce6) - offline website exporter: improved export of a website based on NextJS, but it's not perfect, because latest NextJS version do not have some JS/CSS path in code, but they are generated dynamicly from arrays/objects [`c4993ef`](https://github.com/janreges/siteone-crawler/commit/c4993efcb97f7058834713ed273f9c4274be5cad) - seo analyzer: fixed trim() warning when no <h1> found [`f0c526f`](https://github.com/janreges/siteone-crawler/commit/f0c526f5d2ff7d0155c1bfc7da7a6c0f2f7a1419) - offline export: a lot of improvements when generating the offline version of the website on NextJS - chunk detection from the manifest, replacing paths, etc. [`98c2e15`](https://github.com/janreges/siteone-crawler/commit/98c2e15acf4e22d25301d160968555c19ddd44cc) - seo and og: fixed division by zero when no og/twitter tags found [`19e4259`](https://github.com/janreges/siteone-crawler/commit/19e4259c519a3e41eb7aa8eabce80e6364e74639) - console output: lots of improvements for nice, consistent and minimal word-wrap output [`596a5dc`](https://github.com/janreges/siteone-crawler/commit/596a5dc17945359ffc0fef2ed8ed8ee8bfc1db00) - basic file/dir structure: created ./crawler (for Linux/macOS) and ./crawler.bat for Windows, init script moved to ./src, small related changes about file/dir path building [`5ce41ee`](https://github.com/janreges/siteone-crawler/commit/5ce41ee8e78425747bf40327152bd99499c64013) - header status: ignore too dynamic Content-Disposition header [`4e0c6fd`](https://github.com/janreges/siteone-crawler/commit/4e0c6fdf5c356f8c0eea78ccebe29641b90f96b4) - offline website exporter: added .html extensions to typical dynamic language extensions, because without it the browser will show them 
as source code [`7130b9e`](https://github.com/janreges/siteone-crawler/commit/7130b9eb666eca5b08c9dbeda91198bc85b31379) - html report: show tables with details, even if they are without data (it is good to know that the checks were carried out, but nothing was found) [`da019e4`](https://github.com/janreges/siteone-crawler/commit/da019e4591682c21e9f78de1ec26939088d92ccc) - tests: repaired tests after last changes of file/url building for offline website .. merlot is great! [`7c77c41`](https://github.com/janreges/siteone-crawler/commit/7c77c411ff67c01e07d16cb2acce0e926b264fcd) - utils: be more precise and do not replace attributes in SVG .. creative designers will not love you when looking at the broken SVG in HTML report [`3fc81bb`](https://github.com/janreges/siteone-crawler/commit/3fc81bb0c47eef2935da2e74721a809a9aff0959) - utils: be more precise in parsing phone numbers, otherwise people will 'love' you because of false positives .. wine is still great [`51fd574`](https://github.com/janreges/siteone-crawler/commit/51fd574c764d832d74cb5e67eed890bd9d349a5c) - html parser: better support for formatted html with tags/attributes on multiple lines [`89a36d2`](https://github.com/janreges/siteone-crawler/commit/89a36d2fcf3d96b61c4b3d2e20d5a46f4cb96cb8) - utils: don't be hungry in stripJavaScript() because you ate half of my html :) wine is already in my head... 
[`0e00957`](https://github.com/janreges/siteone-crawler/commit/0e0095727638b7940d2e555a6be231ad3dde19e4) - file result storage: changed cache directory structure for consistency with http client's cache, so it looks like my.domain.tld-443/04/046ec07c.cache [`26bf428`](https://github.com/janreges/siteone-crawler/commit/26bf428f95bc428485d7cf505e74c8a69c94d869) - http client cache: for better consistency with result storage cache, directory structure now contains also port, so it looks like my.domain.tld-443/b9/b989bdcf2b9389cf0c8e5edb435adc05.cache [`a0b2e09`](https://github.com/janreges/siteone-crawler/commit/a0b2e09d01e36aed56c0208a8001d616755de096) - http client cache: improved directory structure for large scale and better orientation for partial cache deleting.. current structure in tmp dir: my.domain.tld/b9/b989bdcf2b9389cf0c8e5edb435adc05.cache [`10e02c1`](https://github.com/janreges/siteone-crawler/commit/10e02c189297f28ea563ba6f3792462c2d6790ea) - offline website exporter: better srcset handling - urls can be defined with or without sizes [`473c1ad`](https://github.com/janreges/siteone-crawler/commit/473c1ad0d753df209aa160b0d90687c4bff21912) - html report: blue color for search term, looks better [`cb47df9`](https://github.com/janreges/siteone-crawler/commit/cb47df98e230c0375dbcb14c278250709bf3644a) - offline website exporter: handled situation of the same-name folder/file when both the folder /foo/next.js/ and the file /foo/next.js existed on the website (real case from vercel.com) [`7c27d2c`](https://github.com/janreges/siteone-crawler/commit/7c27d2c2277dd134615563ee4eaa706ec0ee7485) - exporters: added exec times to summary messages [`41c8873`](https://github.com/janreges/siteone-crawler/commit/41c8873dc33d7f08d91f77d71fcf1bf2fafa30ae) - crawler: use port from URL if defined or by scheme .. 
previous solution didn't work properly for localhost:port and parsed URLs to external websites [`324ba04`](https://github.com/janreges/siteone-crawler/commit/324ba04267b962a56817dd10e3ecba7777702aa2) - heading analysis: changed sorting to DESC by errors, renamed Headings structure -> Heading structure [`dbc1a38`](https://github.com/janreges/siteone-crawler/commit/dbc1a38f33d4094aebe64020531518538e2b3baf) - security analysis: detection and ignoring of URLs that point to a non-existent static file but return 404 HTML, better description [`193fb7d`](https://github.com/janreges/siteone-crawler/commit/193fb7dcf1f994aba69b646576bf7c6f8701a975) - super table: added escapeOutputHtml property to column for better escape managing + updated related supertables [`bfb901c`](https://github.com/janreges/siteone-crawler/commit/bfb901cb82b9cda81198df0dc87885b5eceb5c93) - headings analysis: replace usage of DOMNode->textContent because when the headings contain other tags, including <script>, textContent also contains JS code, but without the <script> tag [`5c426c2`](https://github.com/janreges/siteone-crawler/commit/5c426c24969a063aa3366da02520025733cf16e7) - best practices: better missing quotes detection and minimizing false positives in special cases (HTML/JS in attributes, etc.) [`b03a534`](https://github.com/janreges/siteone-crawler/commit/b03a5345e7f71f880ee4d36fb9f51c230d8c772f) - best practices: better SVG detection and minimizing false positives (e.g. 
code snippets with SVG), improved look in HTML report and better descriptions [`c35f7e2`](https://github.com/janreges/siteone-crawler/commit/c35f7e226f6cd384e5c8cf4b9af3a1a0d3be4cfc) - headers analysis: added [ignored generic values] or [see values below] for specific headers [`a7b444d`](https://github.com/janreges/siteone-crawler/commit/a7b444dab0e1c3949abfa0e0746db18343b9b55d) - core options: changed --hide-scheme-and-host to --show-scheme-and-host (by default is hidden schema+host better) [`3c202e9`](https://github.com/janreges/siteone-crawler/commit/3c202e998a824f97b6f481575a24e2924c9dc663) - truncating: replaced '...' with '…' [`870cf8c`](https://github.com/janreges/siteone-crawler/commit/870cf8cd447fd14e389d76bcc8853b1e691f5349) - accessibility analyzer: better descriptions [`514b471`](https://github.com/janreges/siteone-crawler/commit/514b47124d101cd4f0bd67148f41ea5644febd62) - crawler & http client: if the response is loaded from the cache, we do not wait due to rate limiting - very useful for repeated executions [`61fbfab`](https://github.com/janreges/siteone-crawler/commit/61fbfab34ba07c1856099051b8f68dc76b1adf09) - header stats: added missing strval in values preview [`9e11030`](https://github.com/janreges/siteone-crawler/commit/9e1103064af0962ed4963cace61bf7ad201d19a2) - content type analyzer: increased column width for MIME type from 20 to 26 (enough for application/octet-stream) [`c806674`](https://github.com/janreges/siteone-crawler/commit/c806674ee82d0aba90a9d61e10ff2b5e2cf6c813) - SSL/TLS analyzer: fixed issues on Windows with Cygwin where nslookup does not work reliably [`714b9e1`](https://github.com/janreges/siteone-crawler/commit/714b9e12a2426574731b62d460c98f1fed95aa18) - text output: removed redundant whitespaces from banner after .YYYYMMDD was added to the version number [`8b76205`](https://github.com/janreges/siteone-crawler/commit/8b76205b41ca9cbf4dd32e7d908f4fe932c4a2a3) - readme: added link to #ready-to-use-releases to summary 
[`574b39e`](https://github.com/janreges/siteone-crawler/commit/574b39e836794c98e7be8ceaa81d1ab0c50ab149) - readme: added section Ready-to-use releases [`44d686b`](https://github.com/janreges/siteone-crawler/commit/44d686b910a36747d002ec2886b85c22be5c4864) - changelog: added changelog by https://github.com/cookpete/auto-changelog/tree/master + added 'composer changelog' [`d11af7e`](https://github.com/janreges/siteone-crawler/commit/d11af7e4d847362276e1dd4cec3c25cad38263fb) #### v1.0.0 > 7 November 2023 - proxy: added support for --proxy=<host:port>, closes #1 [`#1`](https://github.com/janreges/siteone-crawler/issues/1) - license: renamed to LICENSE.md [`c0f8ec2`](https://github.com/janreges/siteone-crawler/commit/c0f8ec22a68741b1740981dc98bdec13d8e5182a) - license: added license CC 4.0 BY [`bd5371b`](https://github.com/janreges/siteone-crawler/commit/bd5371b99363fbb5de29c33f0fcc572d154e467d) - version: set v1.0.0.20231107 [`bdbf2be`](https://github.com/janreges/siteone-crawler/commit/bdbf2be97e68cfa01fb992fb960c1c5313d5780f) - version: set v1.0.0 [`a98e61e`](https://github.com/janreges/siteone-crawler/commit/a98e61e161652861541743df6fe1d8c55be446f9) - SSL/TLS analyzer: uncolorize valid-to in summary item, phpstan fixes (non-funcional changes) [`88d1d9f`](https://github.com/janreges/siteone-crawler/commit/88d1d9fec8bc29cd26ab88c18d6c122939b59bba) - content type analyzer: added table with MIME types [`b744f13`](https://github.com/janreges/siteone-crawler/commit/b744f139e417b625bd22ea282f744b55406853b1) - seo analysis: added TOP10 non-unique titles and descriptions to tab SEO and OpenGraph + badges [`4ae14c1`](https://github.com/janreges/siteone-crawler/commit/4ae14c13be5163704c2c6a2d55d75bc83f41f801) - html report: increased sidebar width to prevent wrapping in the case of higher numbers in badges [`c5c8f4c`](https://github.com/janreges/siteone-crawler/commit/c5c8f4cae991bbdd6b6a8a7fab6cbaae1c199344) - dns analyzer: increased column size to prevent auto-truncation of 
dns/ip addresses [`b4d4127`](https://github.com/janreges/siteone-crawler/commit/b4d4127b2b67efd63fff53ae0ad27b6c9a987501) - html report: fixed badge with errors on DNS and SSL tab [`e290403`](https://github.com/janreges/siteone-crawler/commit/e29040349ac4966b22842e52ee4c102a67f9860c) - html report: ensure that no empty tabs will be in report (e.g. in case where all analyzers will be deactivated by --analyzer-filter-regex='/anything/') [`6dd5bcc`](https://github.com/janreges/siteone-crawler/commit/6dd5bcc67d215bca085ef75cb98398aa162ce5fa) - html report: improved replacement of non-badged cells to transparent badge for better alignment [`172a074`](https://github.com/janreges/siteone-crawler/commit/172a074c519a55c492d2b72250232e23749cd75b) - html report: increased visible part of long tables from 500px to 658px (based on typical sidebar height), updated title [`0be355f`](https://github.com/janreges/siteone-crawler/commit/0be355f5474ad6aff461ac3362127569d29eac22) - utils: selected better colors for ansi->html conversion [`6c2a8e3`](https://github.com/janreges/siteone-crawler/commit/6c2a8e364790e2cdb338f164c572aafd9e3db6c1) - SSL/TLS analyzer: evaluation and hints about unsafe or recommended protocols, from-to validation, colorized output [`5cea1fe`](https://github.com/janreges/siteone-crawler/commit/5cea1fe51d500db433c4d86fe5fa8660d2ef2a14) - SEO & OpenGraph analyzers: refactored class names, headings structure moved to own tab, other small improvements [`75a9724`](https://github.com/janreges/siteone-crawler/commit/75a97245af1e896ab3304891dd4459873ad3a26f) - security analyzer: better vulnerabilities explanation and better output formatting [`ee172cb`](https://github.com/janreges/siteone-crawler/commit/ee172cb25073e2e5452b38d5a6c52802e9585bcc) - summary: selected more suitable icons from the utf-8 set that work well in the console and HTML [`ef67483`](https://github.com/janreges/siteone-crawler/commit/ef67483827755895f0edf3149f4f106d28ba1942) - header stats: addValue() 
can accept both string and array [`a0d746b`](https://github.com/janreges/siteone-crawler/commit/a0d746ba9f956c03cb4ad1bddee14a26951ff86d) - headers & redirects - text improvements [`3ac9010`](https://github.com/janreges/siteone-crawler/commit/3ac9010c33e9048f1b3d24182232ae182ae681ca) - dns analyzer: colorized output and added info about CNAME chain into summary [`7dd1f8a`](https://github.com/janreges/siteone-crawler/commit/7dd1f8ac1eafcdcd92f651d397b561f6383fdcfc) - best practices analyzer: added SVG sanitization to prevent XSS, fine-tuning of missing quotes detection, typos [`4dc1eb5`](https://github.com/janreges/siteone-crawler/commit/4dc1eb592de3631f61ed67dfb87466a95462d5f3) - options: added extras option, e.g. for number range validation [`760a865`](https://github.com/janreges/siteone-crawler/commit/760a865082a7cd5f8e439f3fc9094fb7503a78be) - seo and socials: small type-hint and phpstan fixes [`bf695be`](https://github.com/janreges/siteone-crawler/commit/bf695be5fa859ca49bef67fb6511039e4301bb34) - best practice analyzer: added found depth to messages about too deep DOM depth [`220b43c`](https://github.com/janreges/siteone-crawler/commit/220b43c77a6d4747a29cf483e11a985dc07ac460) - analysis: added SSL/TLS analyzer with info about SSL certificate, its validity, supported protocols, issuer .. in the report SSL/TLS info are under tab 'DNS and TLS/SSL' [`3daf175`](https://github.com/janreges/siteone-crawler/commit/3daf1757e1eee765ea3d6b2dca1ed55ffb694d4a) - super table: show fulltext only for >= 10 rows + visible height of the table in HTML shorten to 500px/20 rows and show 'Show entire table' link .. implemented only with HTML+CSS, so that it also works on devices without JS (e.g. 
e-mail browser on iOS) [`7fb9e52`](https://github.com/janreges/siteone-crawler/commit/7fb9e52de2514b0fc1a11032238de815f76acb37) - analysis: added seo & sharing analysis - meta info (title, h1, description, keywords), OG/Twitter data, heading structure details [`53e12e6`](https://github.com/janreges/siteone-crawler/commit/53e12e63102d70b0329194493599523808758716) - best practices: added checks for WebP and AVIF images [`0ccabc6`](https://github.com/janreges/siteone-crawler/commit/0ccabc633cdae4b7ef7b03aad22ab8cfab1a590f) - best practices: added brotli support reporting to tables [`7ff2c53`](https://github.com/janreges/siteone-crawler/commit/7ff2c53e56705c19de77d54db578338252007b99) - super table: added option to specify whether the table should be displayed on the output to the console, html or json [`6bb6217`](https://github.com/janreges/siteone-crawler/commit/6bb62177522a61bab1673b9d5f19e18f50bd54a3) - headers analysis: analysis of HTTP headers of all requests to the main domain, their detailed breakdown, values and statistics [`1fcc1db`](https://github.com/janreges/siteone-crawler/commit/1fcc1dba38a3ac41f0547a4f11a2aef9af1d876f) - analysis: fixed search of attributes with missing quotes [`3db31b9`](https://github.com/janreges/siteone-crawler/commit/3db31b9c01317d8c8ac6eba6b98679be79982c3e) - super table: added the number of found/displayed lines next to the full text [`6e7f3d4`](https://github.com/janreges/siteone-crawler/commit/6e7f3d4b4de0cfa378920c9389291a9902c0c486) - super table: removed setting column widths for HTML table - works best without forcing widths [`2a785e7`](https://github.com/janreges/siteone-crawler/commit/2a785e70b675ef681b005042a50b289b3b29d600) - html report: even wider content of the report is allowed, for better functioning for high-resolution displays [`363990c`](https://github.com/janreges/siteone-crawler/commit/363990c3566cb39d653ab2760df6bb4d2acd8149) - pages 404: truncate too long urls 
[`082bae6`](https://github.com/janreges/siteone-crawler/commit/082bae6f28d2ba8296591a0885548faa0b38a59a) - fixes: fixed various minor warnings related to specific content or parameters [`da1802d`](https://github.com/janreges/siteone-crawler/commit/da1802d82f8ccf2de3f4329bf3b952ebefeb3449) - options: ignore extra comma or empty value in list [`3f5cab6`](https://github.com/janreges/siteone-crawler/commit/3f5cab68bc4981faea7b7bed30b9f687ea773830) - super table: added useful fulltext search for all super tables [`50a4edf`](https://github.com/janreges/siteone-crawler/commit/50a4edf9caa69f67fdc21c3c32a92d201c211ccc) - colors: more light color for badge.neutral in light mode because previous was too contrasting [`0dbad09`](https://github.com/janreges/siteone-crawler/commit/0dbad0920f8f8a9f14186f9513e3ea6793fcf297) - colors: notice is now blue instead of yellow and severity order fix in some places (critical -> warning -> notice -> ok -> info) [`1b50b99`](https://github.com/janreges/siteone-crawler/commit/1b50b99ae079a4d1cdc350038e105d469dec524a) - colors: changed gray color to more platform-consistent color, otherwise gray was too dark on macOS [`173c9bd`](https://github.com/janreges/siteone-crawler/commit/173c9bd211bf066b69bb3adbde487ec3e99f6da1) - scripts: removed helper run.tests* scripts [`e9f0c8f`](https://github.com/janreges/siteone-crawler/commit/e9f0c8ff768042737bfab57b5d2270df995c611e) - analysis: added table with detailed list of security findings and URLs [`5b9e0fe`](https://github.com/janreges/siteone-crawler/commit/5b9e0fe1c3a514941abf2e277bf3f2bd4e017004) - analysis: added SecurityAnalyzer, which checks the existence and values of security headers and performs HTML analysis for common issues [`0cb7cb9`](https://github.com/janreges/siteone-crawler/commit/0cb7cb9daac5303227e31b72b0f6931218968bf7) - http auth: added support for basic HTTP authentication by --http-auth=username:password 
[`147e004`](https://github.com/janreges/siteone-crawler/commit/147e0040e97f6ad37da7897813063cbb73302e22) - error handling: improved behaviour in case of entering a non-existent domain or problems with DNS resolving [`5c08fb4`](https://github.com/janreges/siteone-crawler/commit/5c08fb4c82409863f73fcdcd66f9a0ba76206c5c) - html report: implemented completely redesigned html report with useful information, with light/dark mode and possibility to sort tables by clicking on the header .. design inspired by Zanrly from Shuffle.dev [`05da14f`](https://github.com/janreges/siteone-crawler/commit/05da14f50b108deec4827c5c0324bbd1b9775b37) - http client: fix of extension detection in the case of very non-standard or invalid URLs [`113faa5`](https://github.com/janreges/siteone-crawler/commit/113faa501016f14c017f5f1eaa586a6fae35efbf) - options: increased default memory limit from 512M to 2048M + fixed refactored 'file-system' -> 'file' in docs for result storage [`1471b28`](https://github.com/janreges/siteone-crawler/commit/1471b2884bcbf1806a388e4ae85cc4f7e1bc11fe) - utils: fix that date formats are not detected as a phone number in parsePhoneNumbersFromHtml() [`e4e1009`](https://github.com/janreges/siteone-crawler/commit/e4e10097f7e74816dd716d2713516d5ff8eef39a) - strict types: added declare(strict_types=1) to all classes with related fixes and copyright [`92dd47c`](https://github.com/janreges/siteone-crawler/commit/92dd47c72e4f1aaa5a05187f60f2a9f0a5c285ee) - dns analyzer: added information about the DNS of the given domain - shows the entire cname/alias chain as well as the final resolved IPv4/IPv6 addresses + tests [`199421d`](https://github.com/janreges/siteone-crawler/commit/199421df3c96e2f2bec20f45230cbd812e9fc21c) - utils: helper function parsePhoneNumbersFromHtml() used in BestPracticeAnalyzer + tests [`09cc5fb`](https://github.com/janreges/siteone-crawler/commit/09cc5fbbbdf7f4a706ef912221e32d476fa397b4) - summary consistency: forced dots at the end of each item in the 
summary list [`4758e38`](https://github.com/janreges/siteone-crawler/commit/4758e38c3b2ab73476516662129e3b6abd78ff44) - crawler: support for more benevolent tags for title and meta attributes .. e.g. even the title can contain other HTML attributes [`770b339`](https://github.com/janreges/siteone-crawler/commit/770b339fb7b6ac86af56a864feb184977974d37d) - options: default timeout increased from 3 to 5 seconds .. after testing on a lot of websites, it makes better sense [`eb74207`](https://github.com/janreges/siteone-crawler/commit/eb7420736f5c4d353651ec39d8d030a8485e1486) - super table: added option to force non-breakable spaces in column cells [`3500818`](https://github.com/janreges/siteone-crawler/commit/35008185064331d33c380e0643606f2dbaeb2b64) - best practice analyzer: added measurement of individual steps + added checking of active links with phone numbers <a href="tel: 123..."> [`1bb39e8`](https://github.com/janreges/siteone-crawler/commit/1bb39e87a440975e8956fbf1d66b81ef1b424574) - accessibility analyzer: added measurement of individual steps + removed DOMDocument parsing after refactoring [`2a7c49b`](https://github.com/janreges/siteone-crawler/commit/2a7c49b415dd2864cc37497d409cb083abb99df5) - analysis: added option to measure the duration and number of analysis steps + the analyzeVisitedUrl() method already accepts DOMDocument (if HTML) so the analyzers themselves do not have to do it twice [`d8b9a3d`](https://github.com/janreges/siteone-crawler/commit/d8b9a3d8e0016ec4cc6da908a1bd9db39370e9da) - super table: calculated auto-width can't be shorter than column name (label) [`b97484f`](https://github.com/janreges/siteone-crawler/commit/b97484f22d59bee04b935fa204d18c609ba8658c) - utils: removed ungreedy flag from all regular expressions, it caused problems under some circumstances [`03fc202`](https://github.com/janreges/siteone-crawler/commit/03fc202ed2f30fe4bd2001e8fcaecbea5ca45f7e) - phpstan: fixed all level 5 issues 
[`04c21aa`](https://github.com/janreges/siteone-crawler/commit/04c21aaeeed24117740fac22b5756363e3a4769d) - phpstan: fixed all level 4 issues [`91fee49`](https://github.com/janreges/siteone-crawler/commit/91fee49a0aefa603c4dba9bc1f19d658a7ab413e) - phpstan: fixed all level 3 issues [`2f7866a`](https://github.com/janreges/siteone-crawler/commit/2f7866a389b05e3c796e7f1f0bd7f6410a23cb05) - phpstan: fixed all level 2 issues [`e438996`](https://github.com/janreges/siteone-crawler/commit/e4389962be4a476bdcacc6acc18f36c7037b90ee) - phpstan: installed phpstan with level 2 for now [`b896e6c`](https://github.com/janreges/siteone-crawler/commit/b896e6c0552e4fd938088594a7d44d6af14fc809) - tests: allowed nextjs.org for crawling (incorrectly because of this, a couple of tests did not pass) [`cdc7f56`](https://github.com/janreges/siteone-crawler/commit/cdc7f5688f6aca0e822c3fa6daee6a3acd99eeeb) - refactor: moved /Crawler/ into /src/Crawler/ + added file attachment support to mailer [`2f0d26c`](https://github.com/janreges/siteone-crawler/commit/2f0d26c7d2f7cb65495b375dd4b11bf7849888e2) - sitemap exporter: renamed addErrorToSummary -> addCriticalToSummary [`e46e192`](https://github.com/janreges/siteone-crawler/commit/e46e1926df52a3edfc4137ebd8ede9dee8a45bf1) - text output: added options --show-inline-criticals and --show-inline-warning which displays the found problems directly under the URL - the displayed table will be less clear, but the problems are clearly visible [`725b212`](https://github.com/janreges/siteone-crawler/commit/725b2124172710895d86503fd4a933e2ea91efaa) - composer.json: added require declarations for ext-dom, ext-libxml (used in analyzers) and ext-zlib (used in cache/storages) [`3542cf0`](https://github.com/janreges/siteone-crawler/commit/3542cf03829e9a3c745e58e0df1bc2f6284d25ba) - analysis: added accessibility and best practices analyzers with useful checks [`860316f`](https://github.com/janreges/siteone-crawler/commit/860316fa685509104462412aeb125417dceaee28) - 
analysis: added AnalysisManager for better analysis control with the possibility to filter required analyzers using --analyzer-filter-regex [`150569f`](https://github.com/janreges/siteone-crawler/commit/150569fd20c380781ed5971cefd47308762a730a) - result storage: options --result-storage, --result-storage-dir and --result-storage-compression for storage of response bodies and headers (by default is used memory storage but you can use file storage for extremely large websites) [`d2a8fab`](https://github.com/janreges/siteone-crawler/commit/d2a8fabcef72067500dfcb0065e87ebc4395dac3) - http cache: added --http-cache-dir and --http-cache-compression parameters (by default http cache is on and set to 'tmp/http-client-cache' and compression is disabled) [`2eb9ed8`](https://github.com/janreges/siteone-crawler/commit/2eb9ed86d9d53b4735a3de3cf6d06b652818dbc0) - super table: the currentOrderColumn is already optional - sometimes we want to leave the table sorted according to the input array [`4fba880`](https://github.com/janreges/siteone-crawler/commit/4fba880fcf137a6207df4c5177cf3ec80afaa3ae) - analysis: replaced severity ok/warning/error with ok/notice/warning/critical - it made more sense for analyzers [`18dbaa7`](https://github.com/janreges/siteone-crawler/commit/18dbaa7a4a760874ba39c75af28f7e808fb8eb2e) - analysis: added support for immediate analysis of visited URLs with the possibility to insert the analyzer's own columns into the main table [`004865f`](https://github.com/janreges/siteone-crawler/commit/004865f223c9ec688c4f522cd8f93d8022458130) - content types: fixed json/xml detection [`00fc180`](https://github.com/janreges/siteone-crawler/commit/00fc1808838c7a191cc9986e884ffda26f841281) - content type analyzer: decreased URLs column size from 6 to 5 - that's enough [`2eefbaf`](https://github.com/janreges/siteone-crawler/commit/2eefbafad24f68118a2efe8d6ddedc4d3d45b5cf) - formatting: unification of duration formatting across the entire application 
[`412ee7a`](https://github.com/janreges/siteone-crawler/commit/412ee7ab5c5eda19dfc5492a6cc9edbb7c5969c6) - super table: fixed sorting for array of arrays [`4829be8`](https://github.com/janreges/siteone-crawler/commit/4829be8f8e1d3f0d8201dedfa99d245453601422) - source domains analyzer: minor formatting improvements [`2d32ced`](https://github.com/janreges/siteone-crawler/commit/2d32cedb59aa13e4e27a1dbe58eff586e4407cd9) - offline website exporter: added info about successful export to summary [`92e7e46`](https://github.com/janreges/siteone-crawler/commit/92e7e46bdbc1f1cff329cf4aff5ee99dd70332e2) - help: added red message about invalid CLI parameters also to the end of help output, because help is already too long [`6942e8f`](https://github.com/janreges/siteone-crawler/commit/6942e8f4535d748763a124207634ea7548bbfa83) - super table: added column property 'formatterWillChangeValueLength' to handle situation with the colored text and broken padding [`7371a68`](https://github.com/janreges/siteone-crawler/commit/7371a68f11191b0b21307e6ca703e362f476b815) - analyzers: setting a more meaningful analyzers order [`5e8f747`](https://github.com/janreges/siteone-crawler/commit/5e8f747392f291abdfb0140038c42fe84801955c) - analyzers: added source domains analyzer with summary of domains and downloaded content types (number/size/duration) [`f478f17`](https://github.com/janreges/siteone-crawler/commit/f478f178fb2f79a81e5db89909951816ac6e1c9f) - super table: added auto-width column feature [`d2c04de`](https://github.com/janreges/siteone-crawler/commit/d2c04dec3312d72ed373236d73f7a4d3bbf8c20d) - renaming: '--max-workers' to '--workers' with possibility to use shortcut '-w=<num>' + adding possibility to use shortcut '-rps=<num>' for '--max-reqs-per-sec=<num>' [`218f8ff`](https://github.com/janreges/siteone-crawler/commit/218f8ffcca15550853bcb4ace44dedf260d1e735) - extra columns: added ability to force columns to the required length via "!" 
+ refactoring using ExtraColumn [`def82ff`](https://github.com/janreges/siteone-crawler/commit/def82ff3f5f11efa2e4ef812e086a5c8379ac962) - readme: division of features into several groups and divided accordingly [`c03d231`](https://github.com/janreges/siteone-crawler/commit/c03d2311b618f8aad165ffad39ae51989f60f846) - offline exporter: export of the website to the offline form has already been fine-tuned (but not perfect yet), --disable-* options to disable JS/CSS/images/fonts/etc. and a lot of other related functionalities [`0d04a98`](https://github.com/janreges/siteone-crawler/commit/0d04a9805bdebea708eba44cc6680bd58995d559) - crawler: added possibility to set speed via --max-reqs-per-sec (default 10) [`d57cc4a`](https://github.com/janreges/siteone-crawler/commit/d57cc4a39e6ce1882ee3233b015200382d90f06f) - tests: dividing asserts for URL conversion testing into different detailed groups [`f6221cb`](https://github.com/janreges/siteone-crawler/commit/f6221cb5d3e5e844f146a95940479b20604c37cf) - html url parser: added support for loading fonts from <link href='...'> [`4c482d1`](https://github.com/janreges/siteone-crawler/commit/4c482d1078fb535e4a3be96f6c3e7ded2ea02d65) - manager: remove avif/webp support if OfflineWebsiteExporter is active - we want to use only long-supported jpg/png/gif on the local offline version [`3ec81d3`](https://github.com/janreges/siteone-crawler/commit/3ec81d338590ae16ee337cbbfa8a741e01b0522d) - http response: transformation of the redirect to html with redirection through the <meta> tag [`8f6ff16`](https://github.com/janreges/siteone-crawler/commit/8f6ff161066a82af9ae91a738aae66327fe407b6) - initiator: skip comments or empty arguments [`12f4c52`](https://github.com/janreges/siteone-crawler/commit/12f4c52b7fe0429926c2a6540e8842eae4882888) - http client: added crawler signature to User-Agent and X-Crawler-Info header + added possibility to set Origin request header (otherwise some servers block downloading the fonts) 
[`ae4eaf3`](https://github.com/janreges/siteone-crawler/commit/ae4eaf3298e0bc94c1d913d08393426e380ba4ad) - visited url: added isStaticFile() [`f1cd5e8`](https://github.com/janreges/siteone-crawler/commit/f1cd5e8e397b734dc3353db943c2928ff46cf520) - crawler: increased pcre.backtrack_limit and pcre.recursion_limit (100x) to support longer HTML/CSS/JS [`35a6e9a`](https://github.com/janreges/siteone-crawler/commit/35a6e9a4729fffa7ee0a77b0be50621c4077a7b9) - core options: renamed --headers-to-table to --extra-columns [`7c30988`](https://github.com/janreges/siteone-crawler/commit/7c30988fdecdaeb6aa89aed15a864a033c121d2f) - crawler: added type for audio and xml + static cache for getContentTypeIdByContentTypeHeader [`386599e`](https://github.com/janreges/siteone-crawler/commit/386599e881051ae8c14b7ec9688690e50c0dd7dc) - found urls: normalization of URL takes care of spaces + change of source type to int [`c3063a2`](https://github.com/janreges/siteone-crawler/commit/c3063a247f10bf00b8516eb2303bb85cab426c15) - debugging: possibility to enable debugging through ParsedUrl [`979dc0e`](https://github.com/janreges/siteone-crawler/commit/979dc0e89af063b5ffe04b49275ceb0fa9191db2) - offline url converter: class for solving the translation of URL addresses to offline/local + tests [`44118e6`](https://github.com/janreges/siteone-crawler/commit/44118e6bf96f6b25c7d8410084f76dfb3eb10188) - url converter: TargetDomainRelation enum with tests [`fd6cf21`](https://github.com/janreges/siteone-crawler/commit/fd6cf216d903785adf46923ed2a805937f724d15) - initiator: check only script basename in unknown args check [`888448f`](https://github.com/janreges/siteone-crawler/commit/888448fc9c598a7e8f750e746214b2834722b412) - offline website export: to run the exporter is necessary to set --offline-export-directory [`33e9f95`](https://github.com/janreges/siteone-crawler/commit/33e9f952814b52bdfc7634cf4b9521d393b87417) - offline website export: to run the exporter is necessary to set 
--offline-export-directory [`bcc007b`](https://github.com/janreges/siteone-crawler/commit/bcc007b6a3a9c0e9de23e76bd6f9150c7d2295c9) - log & tmp: added .gitkeep for versioning of these folders - they are used by some optional features [`065f8ef`](https://github.com/janreges/siteone-crawler/commit/065f8ef27fabe889e8a35b98fd75ce260263d268) - offline website export & tests: added the already well-functioning option to export the entire website to offline mode working from local static HTML files, including images, fonts, styles, scripts and other files (no documentation yet) + lot of related changes in Crawler + added first test testing some important functionalities about relative URL building [`4633211`](https://github.com/janreges/siteone-crawler/commit/463321199e6f9bac10b097e3f286da6a13f36906) - composer & phpunit: added composer, phpunit and license CC BY 4.0 [`4979143`](https://github.com/janreges/siteone-crawler/commit/4979143ac2aea9d7b3fe9fcfb9d57f1890c1f114) - visited-url: added info if is external and if is allowed to crawl it [`268a696`](https://github.com/janreges/siteone-crawler/commit/268a6960f8ff69046c8e6c73beae98d24b73ba1f) - text-output: added peak memory usage and average traffic bandwidth to total stats [`cb68340`](https://github.com/janreges/siteone-crawler/commit/cb683407e2cdcd62f5484da96baf9ef43e49a4b3) - crawler: added video support and fixed javascript detection by content-type [`3c3eb96`](https://github.com/janreges/siteone-crawler/commit/3c3eb9625f20657e971249c14cdff97a0a0b8687) - url parsers: extraction of url parsing from html/css into dedicated classes and FoundUrl with info about source tag/attribute [`d87597d`](https://github.com/janreges/siteone-crawler/commit/d87597d36507c7bd6029f87bf1801586eea9b420) - manager: ensure that done callback is executed only once [`d99cccd`](https://github.com/janreges/siteone-crawler/commit/d99cccd91b43680e0726f9c037fb568a9e8be1b4) - http-client: extraction of http client functionality into dedicated 
classes and implemented cache for HTTP responses (critical for efficient development) [`8439e37`](https://github.com/janreges/siteone-crawler/commit/8439e376c50a346e133a2d99e7406020bb89030a) - debugging: added debugging related expert options + Debugger class [`2c89682`](https://github.com/janreges/siteone-crawler/commit/2c89682feaf65a4f224da8ebaf05c48aa899eccc) - parsed-url: added query, it is already needed [`860df08`](https://github.com/janreges/siteone-crawler/commit/860df086ae8c8556420d92e249b3b459b8bf288f) - status: trim only HTML bodies because trim breaks some types of binary files, e.g. avif [`fca2156`](https://github.com/janreges/siteone-crawler/commit/fca2156a2f9607f705a32833a650ae70d5690772) - url parsers: unification of extension length in relevant regexes to {1,10} [`96a3548`](https://github.com/janreges/siteone-crawler/commit/96a35484ba5ab0eee7e43837c1eade1aba6f8a57) - basic-stats: fixed division by zero and nullable times [`8c38b96`](https://github.com/janreges/siteone-crawler/commit/8c38b9660752f132c09e3ceaab596e54176b46e9) - fastest-analyzer: show only URLs with status 200 on the TOP list [`0085dd1`](https://github.com/janreges/siteone-crawler/commit/0085dd1fcbd3b5657eca73345921fe3fc6f407bc) - content-type-analyzer: added stats for 42x statuses (429 Too many requests) [`4f49d12`](https://github.com/janreges/siteone-crawler/commit/4f49d124d1d9993abe3babd9a181c9768b5c2903) - file export: fixed HTML report error after last refactoring [`e77fa6c`](https://github.com/janreges/siteone-crawler/commit/e77fa6cf791da08b522e2124545c303ab5de67ed) - sitemap: publish only URLs with status 200 OK [`b2d4448`](https://github.com/janreges/siteone-crawler/commit/b2d44488a28aeca3421c36ca1e5ada0030de26d8) - summary: added missing </ul> and renamed heading Stats to Summary in HTML report [`c645e16`](https://github.com/janreges/siteone-crawler/commit/c645e16016611a49f70c3d5de9e6ab4d58a45048) - status summary: added summary showing important analyzed metrics with 
OK/WARNING/CRITICAL icons, ordering by severity and INFO about the export execution + interrupting the script by CTRL+C will also run all analyzers, exporters and display all statistics for already processed URLs [`fd643d0`](https://github.com/janreges/siteone-crawler/commit/fd643d016036f4eed5418375f8b25cfe08549ed0) - output consistency: ensuring color and formatting consistency of different types of values (status codes, request durations) [`3ffe1d2`](https://github.com/janreges/siteone-crawler/commit/3ffe1d2a939d718a6fae9c1f927646cfbec808f4) - analyzers: added content-type analyzer with stats for total/avg times, total sizes and statuses 200x, 300x, 400x, 500x [`0475347`](https://github.com/janreges/siteone-crawler/commit/04753478bce1f81dfdab73cd19b0541e725317fe) - crawler: better content-type handling for statistics and added 'Type' column to URL lists + refactored info from array to class [`346caf4`](https://github.com/janreges/siteone-crawler/commit/346caf45f3a18e75a0cf4d0e65961fbee63c9632) - supertable: is now able to display from the array-of-arrays as well as from the array-of-objects + it can translate color declarations from bash to HTML colors when rendering to HTML [`80f0b1c`](https://github.com/janreges/siteone-crawler/commit/80f0b1ca3d50ee7dfae9a01eccbe15fcc06a72d5) - analyzers: TOP slowest/fastest pages analyzer now evaluates only HTML pages, otherwise static content skews the results + decreased minTime for slowest analysis from 0.1 to 0.01 sec (on a very fast and cached website, the results were empty, which is not ideal) [`1390bbc`](https://github.com/janreges/siteone-crawler/commit/1390bbc6daa5484fed8612731dc99f734c406042) - major refactoring: implementation of the Status class summarizing useful information for analyzers/exporters (replaces the JsonOutput over-use) + implementation of basic analyzers (404, redirects, slow/fast URLs) + SuperTable component that exports data to text and HTML + choice of memory-limit setting + change of some 
default values [`efb9a60`](https://github.com/janreges/siteone-crawler/commit/efb9a60aa0be5cb8af55b09723a236370fccb904) - url parsing: fixes for cases when query params are used with htm/html/php/asp etc. + mini readme fix [`af1acfa`](https://github.com/janreges/siteone-crawler/commit/af1acfa9efa536d2ef2e51b2f0a2404ef9d2417a) - minor refactoring: renaming about core options, small non-functional changes [`1dd258e`](https://github.com/janreges/siteone-crawler/commit/1dd258e81eb4d06658e5e41e62141d5be48ce622) - major refactoring: better modularity and auto loading in the area of the exporters, analyzers, their configurability and help auto-building + new mailer options --mail-from-name and --mail-subject-template [`0c57dbd`](https://github.com/janreges/siteone-crawler/commit/0c57dbdb30702cc6669a703788b530fbc4d04af6) - json output: automatic shortening of the URL according to the text width of the console, because if the long URL exceeds the width of the window, the rewriting of the line with the progressbar stops working properly [`106332b`](https://github.com/janreges/siteone-crawler/commit/106332b1d8421dbea5f8725536fa3efed6834564) - manual exit: captures CTRL+C and ends with the statistics for at least the current URLs [`7f4fc80`](https://github.com/janreges/siteone-crawler/commit/7f4fc80c5f9f0fe47da2d9bee2e139489c36a966) - error handling: show red error with help when queue or visited tables are full and info how to fix it [`4efbd73`](https://github.com/janreges/siteone-crawler/commit/4efbd734d775aaa2e6dd66d2d8ed7a007871a1dd) - DOM elements: implemented DOM elements counter and when you add 'DOM' to --headers-to-column you will see DOM elements count [`1837a9c`](https://github.com/janreges/siteone-crawler/commit/1837a9cb12f97a33aec6bcf03a54250bd48545a2) - sitemap and no-color: implemented xml/txt sitemap generator and --no-color option [`f9ade44`](https://github.com/janreges/siteone-crawler/commit/f9ade44d470d97bcc399039bc91a5ce74a6537c1) - readme: added table of 
contents and rewrote intro, features and installation chapters [`469fd1c`](https://github.com/janreges/siteone-crawler/commit/469fd1cf15af4d191c239b2523e0fd8614f7653f) - readme: removed deprecated and duplicate mailer docs [`c5effe8`](https://github.com/janreges/siteone-crawler/commit/c5effe84aece85f7a6aaa97228cd84a5eade4f8b) - readme and CLI help: dividing the parameters into clear groups and improving parameters description - in README.md is detailed form, in CLI instructions is a shorter version. [`19ff724`](https://github.com/janreges/siteone-crawler/commit/19ff724ec0d21f08c4d6cf09def06ba27b023598) - include/ignore regex: added option to limit crawled URLs with the common combination of --include-regex and --ignore-regex [`88e393d`](https://github.com/janreges/siteone-crawler/commit/88e393d33c07fab77173432fd0faf7fe631c2c2c) - html report: masking passwords, styling, added logo, better info ordering and other small changes [`4cdcdab`](https://github.com/janreges/siteone-crawler/commit/4cdcdabf145ffe6f02d84b3250b2a1fc46a5677a) - mailer & exports: implemented ability to send HTML report to e-mail via SMTP + exports to HTML/JSON/TXT file + better reporting of HTTP error conditions (timeout, etc.) 
+ requests for assets are sent only as HEAD without the need to download all binary data + updated documentation [`a97c29d`](https://github.com/janreges/siteone-crawler/commit/a97c29d78f07b4d854853c474fb9d0542b6f2796) - table output: option to set expected column length for better look by 'X-Cache(10)' [`e44f89d`](https://github.com/janreges/siteone-crawler/commit/e44f89d6c3114ccf02c70f38d5ffa5a0f081c1b2) - output: renamed print*() methods to more meaningful add*() relevant also for JSON output [`1069c4a`](https://github.com/janreges/siteone-crawler/commit/1069c4a346d13878c52a316b5953ffa997ec3700) - options: default timeout decreased from 10 to 3, --table-url-column-size renamed to --url-column-size and decreased its default value from 100 to 80, new option --hide-progress-bar, changed --truncate-url-to-column-size to --do-not-truncate-url [`e75038c`](https://github.com/janreges/siteone-crawler/commit/e75038c56afcf85ae591b1dbedf33a54fcd84754) - readme: improved documentation describing use on Windows, macOS or arm64 Linux [`baf2d05`](https://github.com/janreges/siteone-crawler/commit/baf2d0596a3e8367d51fe6ab75793d803e984330) - readme: added info about really tested crawler on Windows with Cygwin (Cygwin has some output limitations and it is not possible to achieve such nice behavior as on Linux) [`1f195c0`](https://github.com/janreges/siteone-crawler/commit/1f195c0c9c8565a37fcb5786070e69c6aa0b8e0e) - windows compatibility: ensuring compatibility with running through cygwin Swoole, which I recommend in the documentation for Windows users [`c22cc45`](https://github.com/janreges/siteone-crawler/commit/c22cc4559ed3de2ac5e4e6e2957b4d3233b4fda5) - json output: implemented nice continuous progress reporting, intentionally on STDERR so the output on STDOUT can be used to save JSON to file + improved README.md [`c095249`](https://github.com/janreges/siteone-crawler/commit/c095249d03c96a00da75553b10dadf7e025a5b0b) - limits: increased limit of max queue length from 1000 to 2000 
(this default will be more suitable even for medium-sized websites) [`c8c3312`](https://github.com/janreges/siteone-crawler/commit/c8c33121c371cc4d0f0791a250178254d9e3a88a) - major refactoring: splitting the code into classes, improving error handling and implementing other functions (JSON output, assets crawling) [`f6902fc`](https://github.com/janreges/siteone-crawler/commit/f6902fc025943ef96150739ae6834358097b235d) - readme: added information how to use crawler with Windows, macOS or arm64 architecture + a few other details [`721f4bb`](https://github.com/janreges/siteone-crawler/commit/721f4bb73e92f65ca3aab789219f046dea665931) - url parsing: handled situations when relative or dotted URLs are also used in HTML, e.g. href='sub/page', href='./sub/page' or href='../sub/page', href='../../sub/page' etc. + few minor optimizations [`c2bbf72`](https://github.com/janreges/siteone-crawler/commit/c2bbf72cf636340a43ebf8472c38008d0fc50f27) - memory allocation: added optional params --max-queue-length=<n> (default 1000), --max-visited-urls=<n> (default 5000) and --max-url-length=<u> (default 2000) [`947a43f`](https://github.com/janreges/siteone-crawler/commit/947a43f3bb826ad852ca51390ae2778fbff320e0) - Initial commit with first version 2023.10.1 [`7109788`](https://github.com/janreges/siteone-crawler/commit/71097884df3c1ade6fd7c02b4ac9ac8f5f161a12) ================================================ FILE: CLAUDE.md ================================================ # CLAUDE.md This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
## Setup After Clone ```bash git config core.hooksPath .githooks # enable pre-commit hook (fmt + clippy + tests) ``` ## Build & Test Commands ```bash cargo fmt # auto-format code (always run before build) cargo build # debug build cargo build --release # release build (~11s) cargo test # unit tests + offline integration tests (~300 tests) cargo test --test integration_crawl -- --ignored --test-threads=1 # network integration tests (crawls crawler.siteone.io) cargo test scoring::ci_gate::tests::all_checks_pass # run a single test by name cargo clippy -- -D warnings # lint (CI enforces zero warnings) cargo fmt -- --check # format check ``` ## Quick Run ```bash ./target/release/siteone-crawler --url=https://example.com --single-page ./target/release/siteone-crawler --url=https://example.com --output=json --http-cache-dir= # no cache ./target/release/siteone-crawler --html-to-markdown=page.html # convert local HTML to markdown (stdout) ./target/release/siteone-crawler --html-to-markdown=page.html --html-to-markdown-output=page.md # convert to file ``` ## Architecture ### Crawl Lifecycle (in order) 1. **CLI Parsing** (`Initiator` → `CoreOptions::parse_argv()`): Parses 120+ CLI options, merges config file if present, validates. Exits with code 101 on error, code 2 on `--help`/`--version`. Non-crawl utility modes (`--serve-markdown`, `--serve-offline`, `--html-to-markdown`) exit early in `main.rs` before creating the Manager. 2. **Analyzer Registration** (`Initiator::register_analyzers()`): Creates all 15 analyzer instances (Accessibility, BestPractice, Caching, ContentType, DNS, ExternalLinks, Fastest, Headers, Page404, Redirects, Security, SeoAndOpenGraph, SkippedUrls, Slowest, SourceDomains, SslTls) and registers them with `AnalysisManager`. Some analyzers receive config from CLI options (e.g. `fastest_top_limit`, `max_heading_level`). 3. 
**Manager Setup** (`Manager::run()`): Creates `Status` (result storage), `Output` (text/json/multi), `HttpClient` (with optional proxy, auth, cache), `ContentProcessorManager` (HTML, CSS, JS, XML, Astro, Next.js, Svelte processors), and the `Crawler` instance. 4. **Robots.txt Fetch** (`Crawler::fetch_robots_txt()`): Before crawling starts, fetches and parses `/robots.txt` from the initial domain. Respects `--ignore-robots-txt` option. 5. **Crawl Loop** (`Crawler::run()`): Breadth-first concurrent URL processing: - URL queue (`DashMap`) seeded with initial URL - Tokio tasks limited by `Semaphore` (= `--workers` count) + rate limiting (`--max-reqs-per-sec`) - Per-URL flow: check robots.txt → HTTP request → on error, store with negative status code → on success, run content processors → extract links from HTML → enqueue discovered URLs - Content processors (`HtmlProcessor`, `CssProcessor`, etc.) transform response bodies during crawl — used by offline/markdown exporters for URL rewriting - Each visited URL's response is stored in `Status` for post-crawl analysis - Per-URL data collected: status code, headers, body, response time, content type, size, redirects 6. **Post-Crawl Analysis** (`Manager::run_post_crawl()`): Sequential pipeline after crawling ends: - Transfer skipped URLs from crawler to `Status` - Run all registered analyzers (`AnalysisManager::run_analyzers()`): each analyzer gets read access to `Status` (all crawled data) and write access to `Output` (adds tables/findings) - Add content processor stats table 7. 
**Exporters** (`Manager::run_exporters()`): Generate output files based on CLI options: - `SitemapExporter`: XML/TXT sitemap files - `OfflineWebsiteExporter`: Static website copy with rewritten relative URLs - `MarkdownExporter`: HTML→Markdown conversion with relative .md links - `FileExporter`: Save text/JSON output to file - `HtmlReport`: Self-contained HTML report (also used by Mailer and Upload) - `MailerExporter`: Email HTML report via SMTP - `UploadExporter`: Upload report to remote server 8. **Scoring** (`scorer::calculate_scores()`): Computes quality scores (0–10) across 5 weighted categories (Performance 20%, SEO 20%, Security 25%, Accessibility 20%, Best Practices 15%). Deductions come from summary findings (criticals, warnings) and stats (404s, 5xx, slow responses). 9. **CI/CD Gate** (`ci_gate::evaluate()`): When `--ci` is active, checks scores and stats against configurable thresholds (`--ci-min-score`, `--ci-max-404`, etc.). Returns exit code 10 on failure. 10. **Summary & Output** (`Output::add_summary()`, `Output::end()`): Prints summary table with OK/Warning/Critical counts, finalizes output. Exit code: 0 = success, 3 = no pages crawled, 10 = CI gate failed. ### How Analyzers Work Each analyzer implements the `Analyzer` trait (`analysis/analyzer.rs`). Analyzers are **post-crawl only** — they don't run during crawling. The `AnalysisManager` calls each analyzer's `analyze(&Status, &mut Output)` method after all URLs have been visited. Analyzers read crawled data from `Status` (visited URLs, response headers, bodies, skipped URLs) and produce `SuperTable` instances that get added to `Output`. Analyzers also add `Item` entries to the `Summary` (OK, Warning, Critical, Info findings) which feed into scoring. ### How Content Processors Work Content processors implement `ContentProcessor` (`content_processor/content_processor.rs`) and run **during crawl** on each URL's response body. 
They serve two purposes: (1) transform content for offline/markdown export (rewrite URLs to relative paths), and (2) extract metadata (links, assets). Processors are type-specific: `HtmlProcessor` handles HTML, `CssProcessor` handles CSS `url()` references, etc. The `ContentProcessorManager` dispatches to the right processor based on content type. ### Concurrency Model The crawler uses tokio for async I/O with a semaphore-based worker pool (`options.workers`). Shared state uses: - `Arc<DashMap<…>>` for lock-free concurrent maps (URL queue, visited URLs, skipped URLs) - `Arc<Mutex<…>>` for sequential-access state (Status, Output, AnalysisManager) - `Arc<AtomicBool>` / `Arc<AtomicUsize>` for simple flags and counters ### Key Traits - **`Analyzer`** (`analysis/analyzer.rs`): Post-crawl analysis (SEO, security, headers, etc.). Each analyzer gets `&Status` and `&mut Output`. - **`Exporter`** (`export/exporter.rs`): Output generators (HTML report, offline website, markdown, sitemap, mailer, upload). - **`Output`** (`output/output.rs`): Formatting backend. Implementations: `TextOutput`, `JsonOutput`, `MultiOutput`. - **`ContentProcessor`** (`content_processor/content_processor.rs`): Per-URL content transformation during crawl (HTML, JS, CSS, XML processors). ### Options System CLI options are defined in `options/core_options.rs` via `get_options()` which returns an `Options` struct with typed option groups. Parsing flow: `parse_argv()` → merge config file → parse flags → `CoreOptions::from_options()` → `apply_option_value()` for each option. New CLI options require: adding the field to `CoreOptions`, a case in `apply_option_value()`, and an entry in the appropriate option group. ### Exit Codes | Code | Meaning | |------|---------| | 0 | Success (with `--ci`: all thresholds passed) | | 1 | Runtime error | | 2 | Help/version displayed | | 3 | No pages successfully crawled (DNS failure, timeout, etc.) 
| | 10 | CI/CD quality gate failed | | 101 | Configuration error | ### HTTP Response Body `HttpResponse.body` is `Option<Vec<u8>>` (not String) to preserve binary data for images, fonts, etc. Use `body_text()` for string content. Failed HTTP requests return `Ok(HttpResponse)` with negative status codes (-1 connection error, -2 timeout, -4 send error), not `Err`. ### Testing Structure - **Unit tests**: In-file `#[cfg(test)] mod tests` blocks (standard Rust convention) - **Integration tests**: `tests/integration_crawl.rs` with shared helpers in `tests/common/mod.rs` - Network-dependent integration tests are `#[ignore]` — run explicitly with `--ignored` ### Testing Complex Scenarios with Sample Websites The crawler has a built-in HTTP server (`--serve-offline=<dir>`) that can serve any local directory as a static website. This enables efficient local testing of edge cases without deploying a real site: 1. Create a sample website directory, e.g. `./tmp/sample-website-xyz/` 2. Add HTML files and assets simulating the desired scenario (spaces in filenames, special characters, redirect chains, broken links, specific heading structures, etc.) 3. Start the built-in server: `./target/release/siteone-crawler --serve-offline=./tmp/sample-website-xyz/ --serve-port=8888` 4. In another terminal, crawl the local site: `./target/release/siteone-crawler --url=http://127.0.0.1:8888/` 5. Verify the crawler handles the scenario correctly (output, offline export, analysis results) This approach is useful for reproducing bug reports, testing regex edge cases (e.g. URLs with spaces, HTML entities, unusual attribute quoting), validating offline/markdown export for specific HTML structures, and any scenario that would be hard to find on a live website. 
### Key Files - `src/engine/crawler.rs` (~1700 lines): Core crawl loop, URL queue management, HTML/content parsing - `src/options/core_options.rs` (~2500 lines): All 120+ CLI options, parsing, validation - `src/export/utils/offline_url_converter.rs` (~1400 lines): URL-to-file-path conversion for offline export - `src/export/html_report/report.rs`: HTML report generation with embedded template - `src/scoring/scorer.rs`: Quality score calculation from summary findings - `src/scoring/ci_gate.rs`: CI/CD threshold evaluation ### Edition & Rust Version Project uses `edition = "2024"` (Rust 1.85+) with `rust-version = "1.94"`. Edition 2024 features used throughout: `unsafe extern` blocks, `if let` chaining (`if let ... && ...`), `unsafe { std::env::set_var() }`. ### Commit Policy **Never commit automatically.** Commits are only allowed on explicit user request. Before every commit, always run `git status`, review the changes, and stage only the relevant files — never use `git add -A` or `git add .` blindly. ### Commit Messages Use [Conventional Commits](https://www.conventionalcommits.org/): `feat:`, `fix:`, `refactor:`, `perf:`, `docs:`, `style:`, `ci:`, `chore:`, `test:`. Examples: - `feat: add built-in HTTP server for markdown/offline exports` - `fix: correct non-ASCII text corruption in heading ID generation` - `perf: eliminate heap allocation in content_type_for_extension` - `chore: bump version to 2.0.3` ### Releasing a New Version 1. Update version in `Cargo.toml` (`version = "X.Y.Z"`) 2. Update version in `src/version.rs` (`pub const CODE: &str = "X.Y.Z.YYYYMMDD";`) 3. Run `cargo check` so that `Cargo.lock` is updated with the new version 4. Commit all three files (`Cargo.toml`, `src/version.rs`, `Cargo.lock`): `git commit -m "chore: bump version to X.Y.Z"` 5. Tag and push: `git tag vX.Y.Z && git push && git push --tags` ### Important Conventions - Tables, column order, and formatting must stay consistent across versions. The HTML parser uses the `scraper` crate. 
- HTTP cache lives in `tmp/http-client-cache/` by default. Delete it for fresh crawls or use `--http-cache-dir=` to disable. - `rustls` requires explicit `ring` CryptoProvider installation in `main.rs`. ================================================ FILE: Cargo.toml ================================================ [package] name = "siteone-crawler" version = "2.3.0" edition = "2024" rust-version = "1.94" authors = ["Ján Regeš "] description = "Website crawler and QA toolkit in Rust for security, performance, SEO, and accessibility audits, offline cloning, markdown export, sitemap generation, cache warming, and CI/CD gating — one dependency-free binary for all major platforms, 10 tools in one." license = "MIT" repository = "https://github.com/janreges/siteone-crawler" homepage = "https://crawler.siteone.io/" keywords = ["crawler", "seo", "website-analysis", "accessibility", "security"] categories = ["command-line-utilities", "web-programming"] readme = "README.md" [[bin]] name = "siteone-crawler" path = "src/main.rs" [dependencies] tokio = { version = "1", features = ["full"] } reqwest = { version = "0.13", features = ["gzip", "brotli", "deflate", "rustls", "socks", "cookies", "stream", "blocking", "multipart"] } scraper = "0.25" regex = "1" clap = { version = "4", features = ["derive"] } serde = { version = "1", features = ["derive"] } serde_json = "1" colored = "3" dashmap = "6" hickory-resolver = "0.25" rustls = { version = "0.23", features = ["ring"] } x509-parser = "0.18" lettre = { version = "0.11", default-features = false, features = ["tokio1-rustls-tls", "smtp-transport", "builder"] } flate2 = "1" brotli = "8" chrono = { version = "0.4", features = ["serde"] } chrono-tz = "0.10" terminal_size = "0.4" quick-xml = "0.39" thiserror = "2" anyhow = "1" md-5 = "0.10" url = "2" percent-encoding = "2" mime = "0.3" once_cell = "1" indexmap = "2" gethostname = "1.1" rustls-native-certs = "0.8" ego-tree = "0.10" base64 = "0.22" dirs = "6" pulldown-cmark = "0.13.1" 
inquire = { version = "0.9", default-features = false, features = ["crossterm"] } crossterm = "0.29" fancy-regex = "0.17" [package.metadata.deb] maintainer = "Ján Regeš " copyright = "2023-2026, Ján Regeš" depends = "libc6" section = "web" priority = "optional" extended-description = """\ SiteOne Crawler is an ultra-fast, open-source website crawler and QA toolkit \ written in Rust. It helps developers, DevOps teams, QA engineers, and technical \ SEO specialists crawl websites, audit quality, stress-test pages under load, \ clone sites for offline browsing and archiving, export content to markdown, \ generate sitemaps, warm caches, and enforce CI/CD quality gates — all from a \ single, dependency-free binary for Linux, macOS, and Windows.\n\ \n\ It combines multiple website tooling workflows in one application: security, \ performance, SEO, accessibility, and best-practices audits; whole-site quality \ scoring; UX checks that other tools miss (e.g. non-clickable phone numbers, \ missing alt text, broken heading hierarchy); reporting of all external links \ with their source pages, redirects, and 404s; stress/load testing with tunable \ concurrency and rate limits; offline multi-domain cloning with URL rewriting; \ markdown export for documentation, archiving, or AI workflows; sitemap \ generation; post-deploy cache warming; and automated quality checks for CI/CD \ pipelines.\n\ \n\ SiteOne Crawler can output results as interactive HTML reports (including an \ image gallery of all pictures found on the site), structured JSON, or readable \ terminal text, making it suitable both for local development and for automation \ in CI/CD environments. 
It can also email HTML reports directly via \ the user's own SMTP server and includes a built-in web server for browsing \ generated markdown exports, plus extensive CLI configurability for advanced \ use cases.\n\ \n\ Whether you need a technical website audit, an offline mirror, a load-testing \ helper, a markdown export for LLM/AI processing, or a reliable quality gate \ before deployment, SiteOne Crawler delivers 10 tools in one — as an ultra-fast, \ portable, open-source Rust binary with zero runtime dependencies.""" assets = [ ["target/release/siteone-crawler", "usr/bin/", "755"], ["README.md", "usr/share/doc/siteone-crawler/", "644"], ["LICENSE", "usr/share/doc/siteone-crawler/", "644"], ] [package.metadata.deb.variants.static] name = "siteone-crawler-static" depends = "" conflicts = "siteone-crawler" provides = "siteone-crawler" extended-description = """\ Statically linked (musl) variant of SiteOne Crawler for maximum Linux compatibility. \ This version runs on any Linux distribution regardless of the installed glibc version. \ Install this if the standard siteone-crawler package reports a 'GLIBC not found' error. 
\ Note: ~50–80% slower than the glibc variant for CPU-intensive operations (offline and \ markdown export) due to the musl memory allocator.""" [package.metadata.generate-rpm] assets = [ { source = "target/release/siteone-crawler", dest = "/usr/bin/siteone-crawler", mode = "0755" }, { source = "README.md", dest = "/usr/share/doc/siteone-crawler/README.md", mode = "0644" }, { source = "LICENSE", dest = "/usr/share/doc/siteone-crawler/LICENSE", mode = "0644" }, ] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023-2026 Ján Regeš Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # SiteOne Crawler SiteOne Crawler is a powerful and easy-to-use **website analyzer, cloner, and converter** designed for developers seeking security and performance insights, SEO specialists identifying optimization opportunities, and website owners needing reliable backups and offline versions. **Now rewritten in Rust** for maximum performance, minimal resource usage, and zero runtime dependencies. The transition from PHP+Swoole to Rust resulted in **25% faster execution** and **30% lower memory consumption** while producing identical output. **Discover the SiteOne Crawler advantage:** * **Run Anywhere:** Single native binary for **🪟 Windows**, **🍎 macOS**, and **🐧 Linux** (x64 & arm64). No runtime dependencies. * **Work Your Way:** Launch the binary without arguments for an **interactive wizard** 🧙 with 10 preset modes, use the extensive **command-line interface** 📟 ([releases](https://github.com/janreges/siteone-crawler/releases), [▶️ video](https://www.youtube.com/watch?v=25T_yx13naA&list=PL9mElgTe-s1Csfg0jXWmDS0MHFN7Cpjwp)) for automation and power, or enjoy the intuitive **desktop GUI application** 💻 ([GUI app](https://github.com/janreges/siteone-crawler-gui), [▶️ video](https://www.youtube.com/watch?v=rFW8LNEVNdw)) for visual control. * **Rich Output Formats:** Interactive **HTML audit report** 📊 with sortable tables and quality scoring (0.0-10.0) (see [nextjs.org sample](https://crawler.siteone.io/html/2024-08-23/forever/cl8xw4r-fdag8wg-44dd.html)), detailed **JSON** for programmatic consumption, and human-readable **text** for terminal. Send HTML reports directly to your inbox via **built-in SMTP mailer** 📧. * **CI/CD Integration:** Built-in **quality gate** (`--ci`) with configurable thresholds — exit code 10 on failure enables automated deployment blocking. 
Also useful for **cache warming** — crawling the entire site after deployment populates your reverse proxy/CDN cache. * **Offline & Markdown Power:** Create complete **offline clones** 💾 for browsing without a server ([nextjs.org clone](https://crawler.siteone.io/examples-exports/nextjs.org/)) or convert entire websites into clean **Markdown** 📝 — perfect for backups, documentation, or feeding content to AI models ([examples](https://github.com/janreges/siteone-crawler-markdown-examples/)). * **Deep Crawling & Analysis:** Thoroughly crawl every page and asset, identify errors (404s, redirects), generate **sitemaps** 🗺️, and even get **email summaries** 📧 (watch [▶️ video example](https://www.youtube.com/watch?v=PHIFSOmk0gk)). * **Learn More:** Dive into the 🌐 [Project Website](https://crawler.siteone.io/), explore the detailed [Documentation](https://crawler.siteone.io/configuration/command-line-options/), or check the [JSON](docs/JSON-OUTPUT.md)/[Text](docs/TEXT-OUTPUT.md) output specs. 
GIF animation of the crawler in action (also available as a [▶️ video](https://www.youtube.com/watch?v=25T_yx13naA&list=PL9mElgTe-s1Csfg0jXWmDS0MHFN7Cpjwp)): ![SiteOne Crawler](docs/siteone-crawler-command-line.gif) ## Table of contents - [✨ Features](#-features) * [🕷️ Crawler](#️-crawler) * [🛠️ Dev/DevOps assistant](#️-devdevops-assistant) * [📊 Analyzer](#-analyzer) * [📧 Reporter](#-reporter) * [💾 Offline website generator](#-offline-website-generator) * [📝 Website to markdown converter](#-website-to-markdown-converter) * [🗺️ Sitemap generator](#️-sitemap-generator) - [🚀 Installation](#-installation) * [📦 Pre-built binaries](#-pre-built-binaries) * [🍺 Homebrew (macOS / Linux)](#-homebrew-macos--linux) * [🐧 Debian / Ubuntu (apt)](#-debian--ubuntu-apt) * [🎩 Fedora / RHEL (dnf)](#-fedora--rhel-dnf) * [🦎 openSUSE / SLES (zypper)](#-opensuse--sles-zypper) * [🏔️ Alpine Linux (apk)](#️-alpine-linux-apk) * [🔨 Build from source](#-build-from-source) - [▶️ Usage](#️-usage) * [Interactive wizard](#interactive-wizard) * [Basic example](#basic-example) * [CI/CD example](#cicd-example) * [Fully-featured example](#fully-featured-example) * [⚙️ Arguments](#️-arguments) + [Basic settings](#basic-settings) + [Output settings](#output-settings) + [Resource filtering](#resource-filtering) + [Advanced crawler settings](#advanced-crawler-settings) + [File export settings](#file-export-settings) + [Mailer options](#mailer-options) + [Upload options](#upload-options) + [Offline exporter options](#offline-exporter-options) + [Markdown exporter options](#markdown-exporter-options) + [Sitemap options](#sitemap-options) + [Expert options](#expert-options) + [Fastest URL analyzer](#fastest-url-analyzer) + [SEO and OpenGraph analyzer](#seo-and-opengraph-analyzer) + [Slowest URL analyzer](#slowest-url-analyzer) + [Built-in HTTP server](#built-in-http-server) + [HTML-to-Markdown conversion](#html-to-markdown-conversion) + [CI/CD settings](#cicd-settings) - [🏆 Quality Scoring](#-quality-scoring) 
- [🔄 CI/CD Integration](#-cicd-integration) - [📄 Output Examples](#-output-examples) - [🧪 Testing](#-testing) - [⚠️ Disclaimer](#️-disclaimer) - [📜 License](#-license) ## ✨ Features In short, the main benefits can be summarized in these points: - **🕷️ Crawler** - very powerful crawler of the entire website reporting useful information about each URL (status code, response time, size, custom headers, titles, etc.) - **🛠️ Dev/DevOps assistant** - offers stress/load testing with configurable concurrent workers (`--workers`) and request rate (`--max-reqs-per-sec`), cache warming, localhost testing, and rich URL/content-type filtering - **📊 Analyzer** - analyzes all webpages and reports strange or error behaviour and useful statistics (404, redirects, bad practices, SEO and security issues, heading structures, etc.) - **📧 Reporter** - interactive **HTML audit report**, structured **JSON**, and colored **text** output; built-in **SMTP mailer** sends HTML reports directly to your inbox - **💾 Offline website generator** - clone entire websites to browsable local HTML files (no server needed) including all assets. Supports **multi-domain clones** — include subdomains or external domains with intelligent cross-linking. - **📝 Website to markdown converter** - export the entire website to browsable text markdown (viewable on GitHub or any text editor), or generate a **single-file markdown** with smart header/footer deduplication — ideal for **feeding to AI tools**. Includes a **built-in web server** that renders markdown exports as styled HTML pages. Also supports **standalone HTML-to-Markdown conversion** of local files (`--html-to-markdown`). See [markdown examples](https://github.com/janreges/siteone-crawler-markdown-examples/). 
- **🗺️ Sitemap generator** - allows you to generate `sitemap.xml` and `sitemap.txt` files with a list of all pages on your website - **🏆 Quality scoring** - automatic quality scoring (0.0-10.0) across 5 categories: Performance, SEO, Security, Accessibility, Best Practices - **🔄 CI/CD quality gate** - configurable thresholds with exit code 10 on failure for automated pipelines; also useful as a **post-deployment cache warmer** for reverse proxies and CDNs The following features are summarized in greater detail: ### 🕷️ Crawler - **all major platforms** supported without dependencies (🐧 Linux, 🪟 Windows, 🍎 macOS, arm64) — single native binary - has incredible **🚀 native Rust performance** with async I/O and multi-threaded crawling - provides simulation of **different device types** (desktop/mobile/tablet) thanks to predefined User-Agents - will crawl **all files**, styles, scripts, fonts, images, documents, etc. on your website - will respect the `robots.txt` file and will not crawl the pages that are not allowed - has a **beautiful interactive** and **🎨 colourful output** - it will **clearly warn you** ⚠️ of any wrong use of the tool (e.g. input parameters validation or wrong permissions) - as `--url` parameter, you can specify also a `sitemap.xml` file (or [sitemap index](https://www.sitemaps.org/protocol.html#index)), which will be processed as a list of URLs. In sitemap-only mode, the crawler follows only URLs from the sitemap — it does not discover additional links from HTML pages. Gzip-compressed sitemaps (`*.xml.gz`) are fully supported, both as direct URLs and when referenced from sitemap index files. - respects the HTML `` tag when resolving relative URLs on pages that use it. ### 🛠️ Dev/DevOps assistant - allows testing **public** and **local projects on specific ports** (e.g. 
`http://localhost:3000/`) - works as a **stress/load tester** — configure the number of **concurrent workers** (`--workers`) and the **maximum requests per second** (`--max-reqs-per-sec`) to simulate various traffic levels and test your infrastructure's resilience against high load or DoS scenarios - combine with **rich filtering options** — include/ignore URLs by regex (`--include-regex`, `--ignore-regex`), disable specific asset types (`--disable-javascript`, `--disable-images`, etc.), or limit crawl depth (`--max-depth`) to focus the load on specific parts of your website - will help you **warm up the application cache** or the **cache on the reverse proxy** of the entire website ### 📊 Analyzer - will **find the weak points** or **strange behavior** of your website - built-in analyzers cover SEO, security headers, accessibility, best practices, performance, SSL/TLS, caching, and more ### 📧 Reporter Three output formats: - **Interactive HTML report** — a self-contained `.html` file with sortable tables, quality scores, color-coded findings, and sections for SEO, security, accessibility, performance, headers, redirects, 404s, and more. Open it in any browser — no server needed. - **JSON output** — structured data with all crawled URLs, response details, analysis findings, scores, and CI/CD gate results. Ideal for programmatic consumption, dashboards, and integrations. - **Text output** — human-readable colored terminal output with tables, progress bars, and summaries. Additional reporting features: - **Built-in SMTP mailer** — send the HTML audit report directly to one or more email addresses via your own SMTP server. Configure sender, recipients, subject template, and SMTP credentials via CLI options. 
- will provide you with data for **SEO analysis**, just add the `Title`, `Keywords` and `Description` extra columns - will provide useful **summaries and statistics** at the end of the processing ### 💾 Offline website generator - will help you **export the entire website** to offline form, where it is possible to browse the site through local HTML files (without HTTP server) including all documents, images, styles, scripts, fonts, etc. - supports **multi-domain clones** — include subdomains (`*.mysite.tld`) or entirely different domains in a single offline export. All URLs across included domains are **intelligently rewritten to relative paths**, so the resulting offline version cross-links pages between domains seamlessly — you get one unified browsable clone. - you can **limit what assets** you want to download and export (see `--disable-*` directives) .. for some types of websites the best result is with the `--disable-javascript` option. - you can specify by `--allowed-domain-for-external-files` (short `-adf`) from which **external domains** it is possible to **download** assets (JS, CSS, fonts, images, documents) including `*` option for all domains. - you can specify by `--allowed-domain-for-crawling` (short `-adc`) which **other domains** should be included in the **crawling** if there are any links pointing to them. You can enable e.g. `mysite.*` to export all language mutations that have a different TLD or `*.mysite.tld` to export all subdomains. - you can use `--single-page` to **export only one page** to which the URL is given (and its assets), but do not follow other pages. - you can use `--single-foreign-page` to **export only one page** from another domain (if allowed by `--allowed-domain-for-crawling`), but do not follow other pages. - you can use `--replace-content` to **replace content** in HTML/JS/CSS with `foo -> bar` or regexp in PCRE format, e.g. `/card[0-9]/i -> card`. Can be specified multiple times. 
- you can use `--replace-query-string` to **replace chars in query string** in the filename. - you can use `--max-depth` to set the **maximum crawling depth** (for pages, not assets). `1` means `/about` or `/about/`, `2` means `/about/contacts` etc. - you can use it to **export your website to a static form** and host it on GitHub Pages, Netlify, Vercel, etc. as a static backup and part of your **disaster recovery plan** or **archival/legal needs** - works great with **older conventional websites** but also **modern ones**, built on frameworks like Next.js, Nuxt.js, SvelteKit, Astro, Gatsby, etc. When a JS framework is detected, the export also performs some framework-specific code modifications for optimal results. - **try it** for your website, and you will be very pleasantly surprised :-) ### 📝 Website to markdown converter Two export modes: - **Multi-file markdown** — exports the entire website with all subpages to a directory of **browsable `.md` files**. The markdown renders nicely when uploaded to GitHub, viewed in VS Code, or any text editor. Links between pages are converted to relative `.md` links so you can navigate between files. Optionally includes images and other files (PDF, etc.). - **Single-file markdown** — combines all pages into **one large markdown file** with smart removal of duplicate website headers and footers across pages. Ideal for **feeding entire website content to AI tools** (ChatGPT, Claude, etc.) that process markdown more effectively than raw HTML. Smart conversion features: - **collapsible accordions** — large link lists (menus, navigation, footer links with 8+ items) are automatically collapsed into `<details>` accordions with contextual labels ("Menu", "Links") for better readability - content before the main heading (typically h1) — such as the site header and navigation — is moved to the end of the page below a `---` separator, so the actual page content comes first - you can set multiple selectors (CSS-like) to **remove unwanted elements** from the exported markdown - **code block detection** and **syntax highlighting** for popular programming languages - HTML tables are converted to proper **markdown tables** Built-in web server: - use `--serve-markdown=<dir>` to start a **built-in HTTP server** that renders your markdown export as styled HTML pages with tables, dark/light mode, breadcrumb navigation, and accordion support — perfect for browsing and sharing the export locally or on a network Standalone HTML-to-Markdown conversion: - use `--html-to-markdown=<file>` to convert a **local HTML file** directly to Markdown without crawling any website - outputs clean Markdown to **stdout** (pipe-friendly) or to a file with `--html-to-markdown-output=<file>` - uses the same conversion pipeline as `--markdown-export-dir` — including all cleanup, accordion collapsing, code language detection, and implicit exclusions (cookie banners, `aria-hidden` elements, `role="menu"` dropdowns) - respects `--markdown-disable-images`, `--markdown-disable-files`, `--markdown-exclude-selector`, and `--markdown-move-content-before-h1-to-end` - does **not** rewrite links (`.html` → `.md`) since the file is standalone with no site context 💡 Tip: you can push the exported markdown folder to your GitHub repository, where it will be automatically rendered as browsable documentation. You can look at the [examples](https://github.com/janreges/siteone-crawler-markdown-examples/) of converted websites to markdown. See all available [markdown exporter options](#markdown-exporter-options) and [HTML-to-Markdown conversion options](#html-to-markdown-conversion). 
### 🗺️ Sitemap generator - will help you create a `sitemap.xml` and `sitemap.txt` for your website - you can set the priority of individual pages based on the number of slashes in the URL Don't hesitate and try it. You will love it as we do! ❤️ ## 🚀 Installation ### 📦 Pre-built binaries Download pre-built binaries from [🐙 GitHub releases](https://github.com/janreges/siteone-crawler/releases) for all major platforms (🐧 Linux, 🪟 Windows, 🍎 macOS, x64 & arm64). The binary is self-contained — no runtime dependencies required. ```bash # Linux / macOS — download, extract, run ./siteone-crawler --url=https://my.domain.tld ``` **🐧 Linux binary variants:** For Linux, two binary variants are provided: | Variant | Compatibility | Performance | |---------|--------------|-------------| | **glibc** (primary) | Requires glibc 2.39+ (Ubuntu 24.04+, Debian 13+, Fedora 40+) | Full native performance | | **musl** (compatible) | Any Linux distribution (statically linked, no dependencies) | ~50–80% slower due to musl memory allocator | The **glibc** variant is recommended for current distributions — it offers the best performance. If you are running an older distribution (e.g. Ubuntu 22.04, Debian 12) and encounter a `GLIBC_2.xx not found` error, use the **musl** variant instead. The musl binary is fully statically linked and runs on any Linux system regardless of the installed glibc version. The performance difference is mainly noticeable during CPU-intensive operations like offline and markdown exports. **Note for macOS users**: In case that Mac refuses to start the crawler from your Download folder, move the entire folder with the Crawler **via the terminal** to another location, for example to the homefolder `~`. 
### 🍺 Homebrew (macOS / Linux) ```bash brew install janreges/tap/siteone-crawler siteone-crawler --url=https://my.domain.tld ``` ### 🐧 Debian / Ubuntu (apt) ```bash curl -1sLf 'https://dl.cloudsmith.io/public/janreges/siteone-crawler/setup.deb.sh' | sudo -E bash sudo apt-get install siteone-crawler ``` > **Older distributions (Ubuntu 22.04, Debian 11/12, etc.):** If you get a `GLIBC_X.XX not found` error, install the statically linked variant instead: > ```bash > sudo apt-get install siteone-crawler-static > ``` > See [Linux binary variants](#-pre-built-binaries) for details on the performance difference. ### 🎩 Fedora / RHEL (dnf) ```bash curl -1sLf 'https://dl.cloudsmith.io/public/janreges/siteone-crawler/setup.rpm.sh' | sudo -E bash sudo dnf install siteone-crawler ``` > **Older distributions:** If you get a `GLIBC_X.XX not found` error, use `sudo dnf install siteone-crawler-static` instead. > See [Linux binary variants](#-pre-built-binaries) for details. ### 🦎 openSUSE / SLES (zypper) ```bash curl -1sLf 'https://dl.cloudsmith.io/public/janreges/siteone-crawler/setup.rpm.sh' | sudo -E bash sudo zypper install siteone-crawler ``` > **Older distributions:** If you get a `GLIBC_X.XX not found` error, use `sudo zypper install siteone-crawler-static` instead. > See [Linux binary variants](#-pre-built-binaries) for details. ### 🏔️ Alpine Linux (apk) ```bash curl -1sLf 'https://dl.cloudsmith.io/public/janreges/siteone-crawler/setup.alpine.sh' | sudo -E bash sudo apk add siteone-crawler ``` ### 🔨 Build from source Requires [Rust](https://www.rust-lang.org/tools/install) 1.94 or later (the `rust-version` pinned in `Cargo.toml`). 
```bash git clone https://github.com/janreges/siteone-crawler.git cd siteone-crawler # Build optimized release binary cargo build --release # Run ./target/release/siteone-crawler --url=https://my.domain.tld ``` **Build statically linked (musl) binary:** ```bash # Install musl toolchain (Ubuntu/Debian) sudo apt-get install musl-tools rustup target add x86_64-unknown-linux-musl # Build static binary (no system dependencies) cargo build --release --target x86_64-unknown-linux-musl # Run — works on any Linux distribution ./target/x86_64-unknown-linux-musl/release/siteone-crawler --url=https://my.domain.tld ``` ## ▶️ Usage ### Interactive wizard Run the binary **without any arguments** and an interactive wizard will guide you through the configuration. Choose from 10 preset modes, enter the target URL, fine-tune settings with arrow keys, and the crawler starts immediately — no need to remember CLI flags. ``` ? Choose a crawl mode: ❯ Quick Audit Fast site health overview — crawls all pages and assets SEO Analysis Extract titles, descriptions, keywords, and OpenGraph tags Performance Test Measure response times with cache disabled — find bottlenecks Security Check Check SSL/TLS, security headers, and redirects site-wide Offline Clone Download entire website with all assets for offline browsing Markdown Export Convert pages to Markdown for AI models or documentation Stress Test High-concurrency load test with cache-busting random params Single Page Deep analysis of a single URL — SEO, security, performance Large Site Crawl High-throughput HTML-only crawl for large sites (100k+ pages) Custom Start from defaults and configure every option manually ────────────────────────────────────── Browse offline export Serve a previously exported offline site via HTTP Browse markdown export Serve a previously exported markdown site via HTTP [↑↓ to move, enter to select, type to filter] ``` After selecting a preset and entering the URL, the wizard shows a settings form where you can 
adjust workers, timeout, content types, export options, and more. A configuration summary with the equivalent CLI command is displayed before the crawl starts — copy it for future use without the wizard. If existing offline or markdown exports are detected in `./tmp/`, the wizard also offers to **serve them via the built-in HTTP server** directly from the menu. ### Basic example To run the crawler from the command line, provide the required arguments: ```bash ./siteone-crawler --url=https://mydomain.tld/ --device=mobile ``` ### CI/CD example ```bash # Fail deployment if quality score < 7.0 or any 5xx errors ./siteone-crawler --url=https://mydomain.tld/ --ci --ci-min-score=7.0 --ci-max-5xx=0 echo $? # 0 = pass, 10 = fail ``` ### Fully-featured example ```bash ./siteone-crawler --url=https://mydomain.tld/ \ --output=text \ --workers=2 \ --max-reqs-per-sec=10 \ --memory-limit=2048M \ --resolve='mydomain.tld:443:127.0.0.1' \ --timeout=5 \ --proxy=proxy.mydomain.tld:8080 \ --http-auth=myuser:secretPassword123 \ --user-agent="My User-Agent String" \ --extra-columns="DOM,X-Cache(10),Title(40),Keywords(50),Description(50>),Heading1=xpath://h1/text()(20>),ProductPrice=regexp:/Price:\s*\$?(\d+(?:\.\d{2})?)/i#1(10)" \ --accept-encoding="gzip, deflate" \ --url-column-size=100 \ --max-queue-length=3000 \ --max-visited-urls=10000 \ --max-url-length=5000 \ --max-non200-responses-per-basename=10 \ --include-regex="/^.*\/technologies.*/" \ --include-regex="/^.*\/fashion.*/" \ --ignore-regex="/^.*\/downloads\/.*\.pdf$/i" \ --analyzer-filter-regex="/^.*$/i" \ --remove-query-params \ --keep-query-param=page \ --add-random-query-params \ --transform-url="live-site.com -> local-site.local" \ --transform-url="/cdn\.live-site\.com/ -> local-site.local/cdn" \ --show-scheme-and-host \ --do-not-truncate-url \ --output-html-report=tmp/myreport.html \ --html-report-options="summary,seo-opengraph,visited-urls,security,redirects" \ --output-json-file=/dir/report.json \ 
--output-text-file=/dir/report.txt \ --add-timestamp-to-output-file \ --add-host-to-output-file \ --offline-export-dir=tmp/mydomain.tld \ --replace-content='/]+>/ -> ' \ --ignore-store-file-error \ --sitemap-xml-file=/dir/sitemap.xml \ --sitemap-txt-file=/dir/sitemap.txt \ --sitemap-base-priority=0.5 \ --sitemap-priority-increase=0.1 \ --markdown-export-dir=tmp/mydomain.tld.md \ --markdown-export-single-file=tmp/mydomain.tld.combined.md \ --markdown-move-content-before-h1-to-end \ --markdown-disable-images \ --markdown-disable-files \ --markdown-remove-links-and-images-from-single-file \ --markdown-exclude-selector='.exclude-me' \ --markdown-replace-content='/]+>/ -> ' \ --markdown-replace-query-string='/[a-z]+=[^&]*(&|$)/i -> $1__$2' \ --mail-to=your.name@my-mail.tld \ --mail-to=your.friend.name@my-mail.tld \ --mail-from=crawler@my-mail.tld \ --mail-from-name="SiteOne Crawler" \ --mail-subject-template="Crawler Report for %domain% (%date%)" \ --mail-smtp-host=smtp.my-mail.tld \ --mail-smtp-port=25 \ --mail-smtp-user=smtp.user \ --mail-smtp-pass=secretPassword123 \ --ci --ci-min-score=7.0 --ci-min-security=8.0 ``` ## ⚙️ Arguments For a clearer list, I recommend going to the documentation: 🌐 https://crawler.siteone.io/configuration/command-line-options/ ### Basic settings | Parameter | Description | |-----------|-------------| | `--url=` | Required. HTTP or HTTPS URL address of the website or sitemap xml to be crawled.
Use quotation marks `''` if the URL contains query parameters. | | `--single-page` | Load only one page to which the URL is given (and its assets), but do not follow other pages. | | `--max-depth=` | Maximum crawling depth (for pages, not assets). Default is `0` (no limit). `1` means `/about`
or `/about/`, `2` means `/about/contacts` etc. | | `--device=` | Device type for choosing a predefined User-Agent. Ignored when `--user-agent` is defined.
Supported values: `desktop`, `mobile`, `tablet`. Default is `desktop`. | | `--user-agent=` | Custom User-Agent header. Use quotation marks. If specified, it takes precedence over
the device parameter. If you add `!` at the end, the siteone-crawler/version will not be
added as a signature at the end of the final user-agent. | | `--timeout=` | Request timeout in seconds. Default is `5`. | | `--proxy=` | HTTP proxy to use in `host:port` format. Host can be hostname, IPv4 or IPv6. | | `--http-auth=` | Basic HTTP authentication in `username:password` format. | | `--config-file=` | Load CLI options from a config file. One option per line, `#` comments allowed.
Without this flag, auto-discovers `~/.siteone-crawler.conf` or `/etc/siteone-crawler.conf`.
CLI arguments override config file values. | ### Output settings | Parameter | Description | |-----------|-------------| | `--output=` | Output type. Supported values: `text`, `json`. Default is `text`. | | `--extra-columns=` | Comma delimited list of extra columns added to output table. You can specify HTTP headers
(e.g. `X-Cache`), predefined values (`Title`, `Keywords`, `Description`, `DOM`), or custom
extraction from text files (HTML, JS, CSS, TXT, JSON, XML, etc.) using XPath or regexp.
For custom extraction, use the format `Custom_column_name=method:pattern#group(length)`, where
`method` is `xpath` or `regexp`, `pattern` is the extraction pattern, an optional `#group` specifies the
capturing group (or node index for XPath) to return (defaulting to the entire match or first node), and an
optional `(length)` sets the maximum output length (append `>` to disable truncation).
For example, use `Heading1=xpath://h1/text()(20>)` to extract the text of the first H1 element
from the HTML document, and `ProductPrice=regexp:/Price:\s*\$?(\d+(?:\.\d{2})?)/i#1(10)`
to extract a numeric price (e.g., "29.99") from a string like "Price: $29.99". | | `--url-column-size=` | Basic URL column width. By default, it is calculated from the size of your terminal window. | | `--rows-limit=` | Max. number of rows to display in tables with analysis results.
Default is `200`. | | `--timezone=` | Timezone for datetimes in HTML reports and timestamps in output folders/files, e.g. `Europe/Prague`.
Default is `UTC`. | | `--do-not-truncate-url` | In the text output, long URLs are truncated by default to `--url-column-size` so the table does not
wrap due to long URLs. With this option, you can turn off the truncation. | | `--show-scheme-and-host` | On text output, show scheme and host also for origin domain URLs. | | `--hide-progress-bar` | Hide progress bar visible in text and JSON output for more compact view. | | `--hide-columns=` | Hide specified columns from the progress table. Comma-separated list of column names:
`type`, `time`, `size`, `cache`. Example: `--hide-columns=cache` or `--hide-columns=cache,type`. | | `--no-color` | Disable colored output. | | `--force-color` | Force colored output regardless of support detection. | | `--show-inline-criticals` | Show criticals from the analyzer directly in the URL table. | | `--show-inline-warnings` | Show warnings from the analyzer directly in the URL table. | ### Resource filtering | Parameter | Description | |-----------|-------------| | `--disable-all-assets` | Disables crawling of all assets and files and only crawls pages in href attributes.
Shortcut for calling all other `--disable-*` flags. | | `--disable-javascript` | Disables JavaScript downloading and removes all JavaScript code from HTML,
including `onclick` and other `on*` handlers. | | `--disable-styles` | Disables CSS file downloading and at the same time removes all style definitions
by `").unwrap()); let result = RE_SCRIPT.replace_all(html, " ").to_string(); RE_STYLE.replace_all(&result, " ").to_string() } ================================================ FILE: src/analysis/caching_analyzer.rs ================================================ // SiteOne Crawler - CachingAnalyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::result::visited_url::VisitedUrl; use crate::utils; const SUPER_TABLE_CACHING_PER_CONTENT_TYPE: &str = "caching-per-content-type"; const SUPER_TABLE_CACHING_PER_DOMAIN: &str = "caching-per-domain"; const SUPER_TABLE_CACHING_PER_DOMAIN_AND_CONTENT_TYPE: &str = "caching-per-domain-and-content-type"; pub struct CachingAnalyzer { base: BaseAnalyzer, } impl Default for CachingAnalyzer { fn default() -> Self { Self::new() } } impl CachingAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } fn update_cache_stat(stat: &mut CacheStat, visited_url: &VisitedUrl) { stat.count += 1; if let Some(lifetime) = visited_url.cache_lifetime { stat.count_with_lifetime += 1; stat.total_lifetime += lifetime; stat.avg_lifetime = Some(stat.total_lifetime as f64 / stat.count_with_lifetime as f64); stat.min_lifetime = Some(match stat.min_lifetime { Some(min) => min.min(lifetime), None => lifetime, }); stat.max_lifetime = Some(match stat.max_lifetime { Some(max) => max.max(lifetime), None => lifetime, }); } } fn build_lifetime_columns(first_col_name: &str, first_col_key: &str) -> Vec { let mut columns = vec![SuperTableColumn::new( first_col_key.to_string(), first_col_name.to_string(), if first_col_key == "domain" { 20 } else { 12 }, None, None, false, false, false, true, None, )]; // Add cacheType column only when not the first column if first_col_key != 
"cacheType" { columns.push(SuperTableColumn::new( "cacheType".to_string(), "Cache type".to_string(), 12, None, None, false, false, false, true, None, )); } columns.extend(vec![ SuperTableColumn::new( "count".to_string(), "URLs".to_string(), 5, None, None, false, false, false, true, None, ), SuperTableColumn::new( "avgLifetime".to_string(), "AVG lifetime".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_cache_lifetime(v, 6) } else { "-".to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "minLifetime".to_string(), "MIN lifetime".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_cache_lifetime(v, 6) } else { "-".to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "maxLifetime".to_string(), "MAX lifetime".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_cache_lifetime(v, 6) } else { "-".to_string() } })), None, false, false, false, true, None, ), ]); columns } } impl Analyzer for CachingAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let visited_urls = status.get_visited_urls(); let mut stats_per_content_type: HashMap = HashMap::new(); let mut stats_per_domain: HashMap = HashMap::new(); let mut stats_per_domain_and_ct: HashMap = HashMap::new(); for visited_url in &visited_urls { let content_type_name = visited_url.content_type.name().to_string(); let cache_type_label = visited_url.get_cache_type_label(); let domain_name = visited_url.get_host().unwrap_or_else(|| "unknown".to_string()); // Per domain { let key = format!("{}.{}", domain_name, cache_type_label); let stat = stats_per_domain.entry(key).or_insert_with(|| CacheStatWithDomain { domain: domain_name.clone(), cache_type: cache_type_label.clone(), stat: CacheStat::default(), }); Self::update_cache_stat(&mut 
stat.stat, visited_url); } // Per domain and content type { let key = format!("{}.{}.{}", domain_name, content_type_name, cache_type_label); let stat = stats_per_domain_and_ct .entry(key) .or_insert_with(|| CacheStatWithDomainAndType { domain: domain_name.clone(), content_type: content_type_name.clone(), cache_type: cache_type_label.clone(), stat: CacheStat::default(), }); Self::update_cache_stat(&mut stat.stat, visited_url); } // Per content type (only crawlable domains) if visited_url.is_allowed_for_crawling { let key = format!("{}.{}", content_type_name, cache_type_label); let stat = stats_per_content_type.entry(key).or_insert_with(|| CacheStatWithType { content_type: content_type_name.clone(), cache_type: cache_type_label.clone(), stat: CacheStat::default(), }); Self::update_cache_stat(&mut stat.stat, visited_url); } } // Per content type table if !stats_per_content_type.is_empty() { let data: Vec> = stats_per_content_type.values().map(|s| s.to_row()).collect(); let columns = Self::build_lifetime_columns("Content type", "contentType"); let mut super_table = SuperTable::new( SUPER_TABLE_CACHING_PER_CONTENT_TYPE.to_string(), "HTTP Caching by content type (only from crawlable domains)".to_string(), "No URLs found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), None, None, Some("HTTP cache".to_string()), ); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } // Per domain table { let data: Vec> = stats_per_domain.values().map(|s| s.to_row()).collect(); let columns = Self::build_lifetime_columns("Domain", "domain"); let mut super_table = SuperTable::new( SUPER_TABLE_CACHING_PER_DOMAIN.to_string(), "HTTP Caching by domain".to_string(), "No URLs found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), None, None, None, ); super_table.set_data(data); 
status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } // Per domain and content type table { let data: Vec> = stats_per_domain_and_ct.values().map(|s| s.to_row()).collect(); let mut columns = Self::build_lifetime_columns("Domain", "domain"); columns.insert( 1, SuperTableColumn::new( "contentType".to_string(), "Content type".to_string(), 12, None, None, false, false, false, true, None, ), ); let mut super_table = SuperTable::new( SUPER_TABLE_CACHING_PER_DOMAIN_AND_CONTENT_TYPE.to_string(), "HTTP Caching by domain and content type".to_string(), "No URLs found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), None, None, None, ); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 116 } fn get_name(&self) -> &str { "CachingAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } #[derive(Default)] struct CacheStat { count: usize, count_with_lifetime: usize, total_lifetime: i64, avg_lifetime: Option, min_lifetime: Option, max_lifetime: Option, } struct CacheStatWithType { content_type: String, cache_type: String, stat: CacheStat, } impl CacheStatWithType { fn to_row(&self) -> HashMap { let mut row = HashMap::new(); row.insert("contentType".to_string(), self.content_type.clone()); row.insert("cacheType".to_string(), self.cache_type.clone()); row.insert("count".to_string(), self.stat.count.to_string()); row.insert( "avgLifetime".to_string(), self.stat .avg_lifetime .map(|v| format!("{}", v as i64)) .unwrap_or_default(), ); row.insert( "minLifetime".to_string(), self.stat.min_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row.insert( 
"maxLifetime".to_string(), self.stat.max_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row } } struct CacheStatWithDomain { domain: String, cache_type: String, stat: CacheStat, } impl CacheStatWithDomain { fn to_row(&self) -> HashMap { let mut row = HashMap::new(); row.insert("domain".to_string(), self.domain.clone()); row.insert("cacheType".to_string(), self.cache_type.clone()); row.insert("count".to_string(), self.stat.count.to_string()); row.insert( "avgLifetime".to_string(), self.stat .avg_lifetime .map(|v| format!("{}", v as i64)) .unwrap_or_default(), ); row.insert( "minLifetime".to_string(), self.stat.min_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row.insert( "maxLifetime".to_string(), self.stat.max_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row } } struct CacheStatWithDomainAndType { domain: String, content_type: String, cache_type: String, stat: CacheStat, } impl CacheStatWithDomainAndType { fn to_row(&self) -> HashMap { let mut row = HashMap::new(); row.insert("domain".to_string(), self.domain.clone()); row.insert("contentType".to_string(), self.content_type.clone()); row.insert("cacheType".to_string(), self.cache_type.clone()); row.insert("count".to_string(), self.stat.count.to_string()); row.insert( "avgLifetime".to_string(), self.stat .avg_lifetime .map(|v| format!("{}", v as i64)) .unwrap_or_default(), ); row.insert( "minLifetime".to_string(), self.stat.min_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row.insert( "maxLifetime".to_string(), self.stat.max_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row } } ================================================ FILE: src/analysis/content_type_analyzer.rs ================================================ // SiteOne Crawler - ContentTypeAnalyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use 
crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::types::ContentTypeId; use crate::utils; const SUPER_TABLE_CONTENT_TYPES: &str = "content-types"; const SUPER_TABLE_CONTENT_MIME_TYPES: &str = "content-types-raw"; pub struct ContentTypeAnalyzer { base: BaseAnalyzer, } impl Default for ContentTypeAnalyzer { fn default() -> Self { Self::new() } } impl ContentTypeAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } fn add_content_type_super_table(&self, status: &Status, output: &mut dyn Output) { let visited_urls = status.get_visited_urls(); let content_type_ids = get_all_content_type_ids(); let mut stats: HashMap = HashMap::new(); for ct_id in &content_type_ids { let key = format!("{:?}", ct_id); stats.insert( key, ContentTypeStat { content_type_id: *ct_id, content_type: ct_id.name().to_string(), count: 0, total_size: 0, total_time: 0.0, status_20x: 0, status_30x: 0, status_40x: 0, status_42x: 0, status_50x: 0, status_other: 0, }, ); } for visited_url in &visited_urls { if visited_url.has_error_status_code() { continue; } let key = format!("{:?}", visited_url.content_type); if let Some(stat) = stats.get_mut(&key) { stat.count += 1; stat.total_size += visited_url.size.unwrap_or(0); stat.total_time += visited_url.request_time; let status_code = visited_url.status_code; if (200..300).contains(&status_code) { stat.status_20x += 1; } else if (300..400).contains(&status_code) { stat.status_30x += 1; } else if (400..420).contains(&status_code) { stat.status_40x += 1; } else if (420..500).contains(&status_code) { stat.status_42x += 1; } else if (500..600).contains(&status_code) { stat.status_50x += 1; } else { stat.status_other += 1; } } } // Remove empty stats and compute avg time let data: Vec> = stats .values() .filter(|s| s.count > 0) .map(|s| { let avg_time = s.total_time / s.count as f64; let mut row = HashMap::new(); row.insert("contentType".to_string(), 
s.content_type.clone()); row.insert("count".to_string(), s.count.to_string()); row.insert("totalSize".to_string(), s.total_size.to_string()); row.insert("totalTime".to_string(), format!("{:.4}", s.total_time)); row.insert("avgTime".to_string(), format!("{:.4}", avg_time)); row.insert("status20x".to_string(), s.status_20x.to_string()); row.insert("status30x".to_string(), s.status_30x.to_string()); row.insert("status40x".to_string(), s.status_40x.to_string()); row.insert("status42x".to_string(), s.status_42x.to_string()); row.insert("status50x".to_string(), s.status_50x.to_string()); row.insert("statusOther".to_string(), s.status_other.to_string()); row }) .collect(); let columns = build_content_type_columns(); let mut super_table = SuperTable::new( SUPER_TABLE_CONTENT_TYPES.to_string(), "Content types".to_string(), "No URLs found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), None, None, None, ); super_table.set_show_only_columns_with_values(true); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } fn add_content_type_raw_super_table(&self, status: &Status, output: &mut dyn Output) { let visited_urls = status.get_visited_urls(); let mut stats: HashMap = HashMap::new(); for visited_url in &visited_urls { if visited_url.has_error_status_code() { continue; } let key = visited_url .content_type_header .clone() .unwrap_or_else(|| "unknown".to_string()); let stat = stats.entry(key.clone()).or_insert_with(|| MimeTypeStat { content_type: key, count: 0, total_size: 0, total_time: 0.0, status_20x: 0, status_30x: 0, status_40x: 0, status_42x: 0, status_50x: 0, status_other: 0, }); stat.count += 1; stat.total_size += visited_url.size.unwrap_or(0); stat.total_time += visited_url.request_time; let status_code = visited_url.status_code; if (200..300).contains(&status_code) { stat.status_20x += 1; } else if 
(300..400).contains(&status_code) { stat.status_30x += 1; } else if (400..420).contains(&status_code) { stat.status_40x += 1; } else if (420..500).contains(&status_code) { stat.status_42x += 1; } else if (500..600).contains(&status_code) { stat.status_50x += 1; } else { stat.status_other += 1; } } let data: Vec> = stats .values() .map(|s| { let avg_time = if s.count > 0 { s.total_time / s.count as f64 } else { 0.0 }; let mut row = HashMap::new(); row.insert("contentType".to_string(), s.content_type.clone()); row.insert("count".to_string(), s.count.to_string()); row.insert("totalSize".to_string(), s.total_size.to_string()); row.insert("totalTime".to_string(), format!("{:.4}", s.total_time)); row.insert("avgTime".to_string(), format!("{:.4}", avg_time)); row.insert("status20x".to_string(), s.status_20x.to_string()); row.insert("status30x".to_string(), s.status_30x.to_string()); row.insert("status40x".to_string(), s.status_40x.to_string()); row.insert("status42x".to_string(), s.status_42x.to_string()); row.insert("status50x".to_string(), s.status_50x.to_string()); row.insert("statusOther".to_string(), s.status_other.to_string()); row }) .collect(); let mut columns = build_content_type_columns(); // Adjust content type column width for MIME types if let Some(col) = columns.first_mut() { col.width = 26; } let mut super_table = SuperTable::new( SUPER_TABLE_CONTENT_MIME_TYPES.to_string(), "Content types (MIME types)".to_string(), "No MIME types found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), None, None, None, ); super_table.set_show_only_columns_with_values(true); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } } impl Analyzer for ContentTypeAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { self.add_content_type_super_table(status, output); 
self.add_content_type_raw_super_table(status, output); } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 210 } fn get_name(&self) -> &str { "ContentTypeAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } struct ContentTypeStat { #[allow(dead_code)] content_type_id: ContentTypeId, content_type: String, count: usize, total_size: i64, total_time: f64, status_20x: usize, status_30x: usize, status_40x: usize, status_42x: usize, status_50x: usize, status_other: usize, } struct MimeTypeStat { content_type: String, count: usize, total_size: i64, total_time: f64, status_20x: usize, status_30x: usize, status_40x: usize, status_42x: usize, status_50x: usize, status_other: usize, } fn build_content_type_columns() -> Vec { vec![ SuperTableColumn::new( "contentType".to_string(), "Content type".to_string(), 12, None, None, false, false, false, true, None, ), SuperTableColumn::new( "count".to_string(), "URLs".to_string(), 5, None, None, false, false, false, true, None, ), SuperTableColumn::new( "totalSize".to_string(), "Total size".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_formatted_size(v, 0) } else { "-".to_string() } } else { "-".to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "totalTime".to_string(), "Total time".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_formatted_duration(v) } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "avgTime".to_string(), "Avg time".to_string(), 8, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_request_time(v, 8) } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "status20x".to_string(), "Status 
20x".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "green", false) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "status30x".to_string(), "Status 30x".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "yellow", true) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "status40x".to_string(), "Status 40x".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "magenta", true) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "status42x".to_string(), "Status 42x".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "magenta", true) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "status50x".to_string(), "Status 50x".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "red", true) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "statusOther".to_string(), "Status ERR".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "red", true) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), ] } fn 
get_all_content_type_ids() -> Vec { vec![ ContentTypeId::Html, ContentTypeId::Script, ContentTypeId::Stylesheet, ContentTypeId::Image, ContentTypeId::Video, ContentTypeId::Audio, ContentTypeId::Font, ContentTypeId::Document, ContentTypeId::Json, ContentTypeId::Xml, ContentTypeId::Redirect, ContentTypeId::Other, ] } ================================================ FILE: src/analysis/dns_analyzer.rs ================================================ // SiteOne Crawler - DnsAnalyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::analysis::result::dns_analysis_result::DnsAnalysisResult; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::utils; const SUPER_TABLE_DNS: &str = "dns"; pub struct DnsAnalyzer { base: BaseAnalyzer, } impl Default for DnsAnalyzer { fn default() -> Self { Self::new() } } impl DnsAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } /// Resolve DNS for the given domain using hickory-resolver. fn get_dns_info(&self, domain: &str) -> Result { use hickory_resolver::Resolver; use hickory_resolver::proto::rr::RecordType; let domain_owned = domain.to_string(); // Use block_in_place to allow blocking the current thread while running async DNS lookups tokio::task::block_in_place(|| { let rt = tokio::runtime::Handle::current(); rt.block_on(async { let resolver = Resolver::builder_tokio() .map_err(|e| format!("Failed to create DNS resolver: {}", e))? 
.build(); let mut resolved_domains = vec![domain_owned.clone()]; let mut ipv4_addresses = Vec::new(); let mut ipv6_addresses = Vec::new(); // Resolve CNAME records if let Ok(cname_response) = resolver.lookup(domain_owned.as_str(), RecordType::CNAME).await { for record in cname_response.iter() { let cname_str = record.to_string().trim_end_matches('.').to_string(); if !resolved_domains.contains(&cname_str) { resolved_domains.push(cname_str); } } } // Resolve A records (IPv4) if let Ok(ipv4_response) = resolver.lookup(domain_owned.as_str(), RecordType::A).await { for record in ipv4_response.iter() { let ip_str = record.to_string(); if !ip_str.is_empty() { ipv4_addresses.push(ip_str); } } } // Resolve AAAA records (IPv6) if let Ok(ipv6_response) = resolver.lookup(domain_owned.as_str(), RecordType::AAAA).await { for record in ipv6_response.iter() { let ip_str = record.to_string(); if !ip_str.is_empty() { ipv6_addresses.push(ip_str); } } } if ipv4_addresses.is_empty() && ipv6_addresses.is_empty() { return Err(format!("Unable to resolve DNS records for {}", domain_owned)); } let dns_server_ip = Self::get_system_dns_server().unwrap_or_else(|| "0.0.0.0".to_string()); let dns_server_name = dns_server_ip.clone(); Ok(DnsAnalysisResult::new( dns_server_name, dns_server_ip, resolved_domains, ipv4_addresses, ipv6_addresses, )) }) }) } /// Read the first nameserver entry from /etc/resolv.conf to get the system DNS server IP. 
fn get_system_dns_server() -> Option { let contents = std::fs::read_to_string("/etc/resolv.conf").ok()?; for line in contents.lines() { let trimmed = line.trim(); if trimmed.starts_with("nameserver") && let Some(ip) = trimmed.split_whitespace().nth(1) { return Some(ip.to_string()); } } None } } impl Analyzer for DnsAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let columns = vec![SuperTableColumn::new( "info".to_string(), "DNS resolving tree".to_string(), 70, Some(Box::new(|value: &str, _render_into: &str| { let mut result = value.to_string(); // Colorize IPv4 addresses if let Ok(re) = regex::Regex::new(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})") { result = re .replace_all(&result, |caps: ®ex::Captures| { let ip = &caps[1]; if ip.parse::().is_ok() { utils::get_color_text(ip, "blue", true) } else { ip.to_string() } }) .to_string(); } // Colorize IPv6 addresses if let Ok(re) = regex::Regex::new(r"([0-9a-f:]+:+)+[0-9a-f]+") { result = re .replace_all(&result, |caps: ®ex::Captures| { let ip = &caps[0]; if ip.parse::().is_ok() { utils::get_color_text(ip, "blue", true) } else { ip.to_string() } }) .to_string(); } result })), None, true, false, true, false, None, )]; let mut super_table = SuperTable::new( SUPER_TABLE_DNS.to_string(), "DNS info".to_string(), "No DNS info found.".to_string(), columns, false, None, "ASC".to_string(), None, None, None, ); let mut data: Vec> = Vec::new(); // Extract domain from the first visited URL let domain = status .get_visited_urls() .first() .and_then(|u| u.get_host()) .unwrap_or_else(|| "unknown".to_string()); match self.get_dns_info(&domain) { Ok(dns_info) => { for line in dns_info.get_txt_description().lines() { let mut row = HashMap::new(); row.insert("info".to_string(), line.to_string()); data.push(row); } let resolved_domain = dns_info .resolved_domains .first() .cloned() .unwrap_or_else(|| "unknown".to_string()); // DNS server suffix — omit when unknown (e.g. 
on Windows where /etc/resolv.conf doesn't exist) let dns_suffix = if dns_info.dns_server_ip_address != "0.0.0.0" { format!(" (DNS server: {})", dns_info.dns_server_name) } else { String::new() }; // IPv4 summary if !dns_info.ipv4_addresses.is_empty() { status.add_ok_to_summary( "dns-ipv4", &format!( "DNS IPv4 OK: domain {} resolved to {}{}", resolved_domain, dns_info.ipv4_addresses.join(", "), dns_suffix ), ); } else { status.add_notice_to_summary( "dns-ipv4", &format!( "DNS IPv4: domain {} does not support IPv4{}", resolved_domain, dns_suffix ), ); } // IPv6 summary if !dns_info.ipv6_addresses.is_empty() { status.add_ok_to_summary( "dns-ipv6", &format!( "DNS IPv6 OK: domain {} resolved to {}{}", resolved_domain, dns_info.ipv6_addresses.join(", "), dns_suffix ), ); } else { status.add_notice_to_summary( "dns-ipv6", &format!( "DNS IPv6: domain {} does not support IPv6{}", resolved_domain, dns_suffix ), ); } // CNAME chain summary if dns_info.resolved_domains.len() > 1 { status.add_info_to_summary( "dns-aliases", &format!( "DNS Aliases: IP(s) for domain {} were resolved by CNAME chain {}.", resolved_domain, dns_info.resolved_domains.join(" > ") ), ); } } Err(e) => { let mut row = HashMap::new(); row.insert("info".to_string(), e.clone()); data.push(row); status.add_critical_to_summary("dns", &format!("Problem with DNS analysis: {}", e)); } } super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_end(super_table); } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 215 } fn get_name(&self) -> &str { "DnsAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } ================================================ FILE: src/analysis/external_links_analyzer.rs ================================================ // SiteOne Crawler - ExternalLinksAnalyzer // (c) Jan 
Reges // // Presents external URLs discovered during crawling as a dedicated section. // Groups external URLs, shows occurrence count and up to 5 source pages. use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::types::SkippedReason; const SUPER_TABLE_EXTERNAL_URLS: &str = "external-urls"; const MAX_SOURCE_PAGES: usize = 5; pub struct ExternalLinksAnalyzer { base: BaseAnalyzer, } impl Default for ExternalLinksAnalyzer { fn default() -> Self { Self::new() } } impl ExternalLinksAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } } impl Analyzer for ExternalLinksAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let skipped_entries = status.get_skipped_urls(); // Filter only external links (NotAllowedHost reason) let external_entries: Vec<_> = skipped_entries .iter() .filter(|e| matches!(e.reason, SkippedReason::NotAllowedHost)) .collect(); // Group by external URL: collect count and source page URLs let mut url_data: HashMap> = HashMap::new(); for entry in &external_entries { let source_url = status.get_url_by_uq_id(&entry.source_uq_id).unwrap_or_default(); let sources = url_data.entry(entry.url.clone()).or_default(); if !source_url.is_empty() && !sources.contains(&source_url) { sources.push(source_url); } } let total_urls = url_data.len(); let mut rows: Vec> = url_data .iter() .map(|(ext_url, sources)| { let mut row = HashMap::new(); row.insert("url".to_string(), ext_url.clone()); row.insert("count".to_string(), sources.len().to_string()); let display_sources: Vec<&str> = sources.iter().take(MAX_SOURCE_PAGES).map(|s| s.as_str()).collect(); let mut found_on = display_sources.join(", "); if sources.len() > MAX_SOURCE_PAGES { found_on.push_str(&format!(" (+{})", 
sources.len() - MAX_SOURCE_PAGES)); } row.insert("foundOn".to_string(), found_on); row }) .collect(); rows.sort_by(|a, b| { let count_a: usize = a.get("count").and_then(|c| c.parse().ok()).unwrap_or(0); let count_b: usize = b.get("count").and_then(|c| c.parse().ok()).unwrap_or(0); count_b.cmp(&count_a).then_with(|| a.get("url").cmp(&b.get("url"))) }); let url_column_width = 60; let columns = vec![ SuperTableColumn::new( "url".to_string(), "External URL".to_string(), url_column_width, None, None, true, true, false, true, None, ), SuperTableColumn::new( "count".to_string(), "Pages".to_string(), 5, None, None, false, false, false, true, None, ), SuperTableColumn::new( "foundOn".to_string(), "Found on URL (max 5)".to_string(), url_column_width, None, None, true, true, false, true, None, ), ]; let mut super_table = SuperTable::new( SUPER_TABLE_EXTERNAL_URLS.to_string(), "External URLs".to_string(), "No external URLs found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), Some(format!("{} external URL(s)", total_urls)), None, None, ); super_table.set_data(rows); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); status.add_summary_item_by_ranges( "external-urls", total_urls as f64, &[(0.0, 0.0), (1.0, f64::MAX)], &[ "External URLs - no external URLs found", "External URLs - {} external URL(s) found", ], ); } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 7 // After skipped URLs (6) } fn get_name(&self) -> &str { "ExternalLinksAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } ================================================ FILE: src/analysis/fastest_analyzer.rs ================================================ // SiteOne Crawler - FastestAnalyzer // (c) Jan Reges use std::collections::HashMap; use 
crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::types::ContentTypeId; use crate::utils; const SUPER_TABLE_FASTEST_URLS: &str = "fastest-urls"; pub struct FastestAnalyzer { base: BaseAnalyzer, fastest_top_limit: usize, fastest_max_time: f64, } impl Default for FastestAnalyzer { fn default() -> Self { Self::new() } } impl FastestAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), fastest_top_limit: 20, fastest_max_time: 1.0, } } /// Set configuration from CoreOptions. pub fn set_config(&mut self, fastest_top_limit: usize, fastest_max_time: f64) { self.fastest_top_limit = fastest_top_limit; self.fastest_max_time = fastest_max_time; } } impl Analyzer for FastestAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let visited_urls = status.get_visited_urls(); let mut fast_urls: Vec<_> = visited_urls .into_iter() .filter(|u| { u.status_code == 200 && u.is_allowed_for_crawling && u.content_type == ContentTypeId::Html && u.request_time <= self.fastest_max_time }) .collect(); fast_urls.sort_by(|a, b| { a.request_time .partial_cmp(&b.request_time) .unwrap_or(std::cmp::Ordering::Equal) }); fast_urls.truncate(self.fastest_top_limit); let console_width = utils::get_console_width(); let url_column_width = (console_width as i32 - 20).max(20); let columns = vec![ SuperTableColumn::new( "requestTime".to_string(), "Time".to_string(), 6, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_request_time(v, 6) } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "statusCode".to_string(), "Status".to_string(), 6, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_status_code(v, 
6) } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "url".to_string(), "Fast URL".to_string(), url_column_width, None, None, true, true, false, true, None, ), ]; let data: Vec> = fast_urls .iter() .map(|u| { let mut row = HashMap::new(); row.insert("requestTime".to_string(), format!("{:.4}", u.request_time)); row.insert("statusCode".to_string(), u.status_code.to_string()); row.insert("url".to_string(), u.url.clone()); row }) .collect(); let mut super_table = SuperTable::new( SUPER_TABLE_FASTEST_URLS.to_string(), "TOP fastest URLs".to_string(), format!("No fast URLs faster than {} second(s) found.", self.fastest_max_time), columns, true, Some("requestTime".to_string()), "ASC".to_string(), None, None, None, ); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 100 } fn get_name(&self) -> &str { "FastestAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } ================================================ FILE: src/analysis/headers_analyzer.rs ================================================ // SiteOne Crawler - HeadersAnalyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::analysis::result::header_stats::HeaderStats; use crate::analysis::result::url_analysis_result::UrlAnalysisResult; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::result::visited_url::VisitedUrl; use crate::utils; const SUPER_TABLE_HEADERS: &str = "headers"; const SUPER_TABLE_HEADERS_VALUES: &str = 
"headers-values"; pub struct HeadersAnalyzer { base: BaseAnalyzer, header_stats: HashMap, } impl Default for HeadersAnalyzer { fn default() -> Self { Self::new() } } impl HeadersAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), header_stats: HashMap::new(), } } } impl Analyzer for HeadersAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let console_width = utils::get_console_width(); // Basic header stats table let data: Vec> = self .header_stats .values() .map(|hs| { let mut row = HashMap::new(); row.insert("header".to_string(), hs.get_formatted_header_name()); row.insert("occurrences".to_string(), hs.occurrences.to_string()); let unique_count = hs.unique_values.len(); let unique_str = if unique_count == 0 { "-".to_string() } else if hs.unique_values_limit_reached { format!("{}+", unique_count) } else { unique_count.to_string() }; row.insert("uniqueValues".to_string(), unique_str); row.insert("valuesPreview".to_string(), hs.get_values_preview(120)); let min_value = hs.get_min_value().unwrap_or_default(); let max_value = hs.get_max_value().unwrap_or_default(); // Format min/max for content-length and age if hs.header == "content-length" { if let Some(min_int) = hs.min_int_value { row.insert("minValue".to_string(), utils::get_formatted_size(min_int, 0)); } else { row.insert("minValue".to_string(), String::new()); } if let Some(max_int) = hs.max_int_value { row.insert("maxValue".to_string(), utils::get_formatted_size(max_int, 0)); } else { row.insert("maxValue".to_string(), String::new()); } } else if hs.header == "age" { if let Some(min_int) = hs.min_int_value { row.insert("minValue".to_string(), utils::get_formatted_age(min_int)); } else { row.insert("minValue".to_string(), String::new()); } if let Some(max_int) = hs.max_int_value { row.insert("maxValue".to_string(), utils::get_formatted_age(max_int)); } else { row.insert("maxValue".to_string(), String::new()); } } else { row.insert("minValue".to_string(), min_value); 
row.insert("maxValue".to_string(), max_value); } row }) .collect(); let columns = vec![ SuperTableColumn::new( "header".to_string(), "Header".to_string(), -1, // AUTO_WIDTH None, None, false, false, false, true, None, ), SuperTableColumn::new( "occurrences".to_string(), "Occurs".to_string(), 6, None, None, false, false, false, true, None, ), SuperTableColumn::new( "uniqueValues".to_string(), "Unique".to_string(), 6, None, None, false, false, false, true, None, ), SuperTableColumn::new( "valuesPreview".to_string(), "Values preview".to_string(), (console_width as i32 - 90).max(20), None, None, true, true, false, false, None, ), SuperTableColumn::new( "minValue".to_string(), "Min value".to_string(), 10, None, None, false, false, false, true, None, ), SuperTableColumn::new( "maxValue".to_string(), "Max value".to_string(), 10, None, None, false, false, false, true, None, ), ]; let mut super_table = SuperTable::new( SUPER_TABLE_HEADERS.to_string(), "HTTP headers".to_string(), "No HTTP headers found.".to_string(), columns, true, Some("header".to_string()), "ASC".to_string(), None, None, None, ); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_end(super_table); let unique_count = self.header_stats.len(); status.add_summary_item_by_ranges( "unique-headers", unique_count as f64, &[(0.0, 30.0), (31.0, 40.0), (41.0, 50.0), (51.0, f64::MAX)], &[ "HTTP headers - found {} unique headers", "HTTP headers - found {} unique headers", "HTTP headers - found {} unique headers (too many)", "HTTP headers - found {} unique headers (too many)", ], ); // Detail info with header values let mut details: Vec> = Vec::new(); for header_stat in self.header_stats.values() { for (value, count) in &header_stat.unique_values { let mut row = HashMap::new(); row.insert("header".to_string(), header_stat.get_formatted_header_name()); row.insert("occurrences".to_string(), count.to_string()); 
row.insert("value".to_string(), value.clone()); details.push(row); } } // Sort by header asc, then by occurrences desc details.sort_by(|a, b| { let header_a = a.get("header").cloned().unwrap_or_default(); let header_b = b.get("header").cloned().unwrap_or_default(); if header_a == header_b { let occ_a = a.get("occurrences").and_then(|v| v.parse::().ok()).unwrap_or(0); let occ_b = b.get("occurrences").and_then(|v| v.parse::().ok()).unwrap_or(0); occ_b.cmp(&occ_a) } else { header_a.cmp(&header_b) } }); let detail_columns = vec![ SuperTableColumn::new( "header".to_string(), "Header".to_string(), -1, // AUTO_WIDTH None, None, false, false, false, true, None, ), SuperTableColumn::new( "occurrences".to_string(), "Occurs".to_string(), 6, None, None, false, false, false, true, None, ), SuperTableColumn::new( "value".to_string(), "Value".to_string(), (console_width as i32 - 56).max(20), None, None, true, true, false, true, None, ), ]; let mut detail_table = SuperTable::new( SUPER_TABLE_HEADERS_VALUES.to_string(), "HTTP header values".to_string(), "No HTTP headers found.".to_string(), detail_columns, true, None, "ASC".to_string(), None, None, None, ); detail_table.set_data(details); status.configure_super_table_url_stripping(&mut detail_table); output.add_super_table(&detail_table); status.add_super_table_at_end(detail_table); } fn analyze_visited_url( &mut self, visited_url: &VisitedUrl, _body: Option<&str>, headers: Option<&HashMap>, ) -> Option { let headers = headers?; if !visited_url.is_allowed_for_crawling { return None; } for (header, values) in headers { let header_lower = header.to_lowercase(); let stat = self .header_stats .entry(header_lower.clone()) .or_insert_with(|| HeaderStats::new(header_lower)); stat.add_value(values); } None } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 115 } fn get_name(&self) -> &str { "HeadersAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> 
&HashMap<String, usize> {
        self.base.get_exec_counts()
    }
}


================================================
FILE: src/analysis/manager.rs
================================================
// SiteOne Crawler - Analysis Manager
// (c) Jan Reges

use std::collections::HashMap;

use crate::analysis::analyzer::Analyzer;
use crate::analysis::result::url_analysis_result::UrlAnalysisResult;
use crate::output::output::Output;
use crate::result::manager_stats::ManagerStats;
use crate::result::status::Status;
use crate::result::visited_url::VisitedUrl;
use crate::utils;

pub const SUPER_TABLE_ANALYSIS_STATS: &str = "analysis-stats";

/// Owns all registered analyzers and orchestrates per-URL and post-crawl analysis.
pub struct AnalysisManager {
    analyzers: Vec<Box<dyn Analyzer>>,
    stats: ManagerStats,
}

impl AnalysisManager {
    pub fn new() -> Self {
        Self {
            analyzers: Vec::new(),
            stats: ManagerStats::new(),
        }
    }

    /// Register all analyzer instances. Each analyzer's should_be_activated()
    /// determines whether it is actually used.
    pub fn register_analyzer(&mut self, analyzer: Box<dyn Analyzer>) {
        self.analyzers.push(analyzer);
    }

    /// Auto-activate: remove analyzers that should not be activated based on options.
    pub fn auto_activate_analyzers(&mut self) {
        self.analyzers.retain(|a| a.should_be_activated());
    }

    /// Filter analyzers by regex pattern.
    /// Only analyzers whose name matches the regex are kept.
    /// Supports PCRE-style delimited patterns (e.g., /security/i).
    pub fn filter_analyzers_by_regex(&mut self, filter_regex: &str) {
        let pattern = utils::extract_pcre_regex_pattern(filter_regex);
        if let Ok(re) = fancy_regex::Regex::new(&pattern) {
            // fancy_regex::is_match returns Result — keep the analyzer on evaluation error
            self.analyzers.retain(|a| re.is_match(a.get_name()).unwrap_or(true));
        }
    }

    /// Run analyze_visited_url for each active analyzer.
    /// Called per URL during the crawl.
    pub fn analyze_visited_url(
        &mut self,
        visited_url: &VisitedUrl,
        body: Option<&str>,
        headers: Option<&HashMap<String, String>>,
        status: &Status,
    ) -> Vec<(String, UrlAnalysisResult)> {
        let mut results = Vec::new();
        for analyzer in &mut self.analyzers {
            if let Some(result) = analyzer.analyze_visited_url(visited_url, body, headers) {
                let name = analyzer.get_name().to_string();
                status.add_url_analysis_result(
                    &visited_url.uq_id,
                    crate::result::status::UrlAnalysisResultEntry {
                        analysis_name: name.clone(),
                        result: result.clone(),
                    },
                );
                results.push((name, result));
            }
        }
        results
    }

    /// Run post-crawl analysis for all active analyzers, sorted by order.
    pub fn run_analyzers(&mut self, status: &Status, output: &mut dyn Output) {
        // Check if there are any working URLs
        if status.get_number_of_working_visited_urls() == 0 {
            let error_message = "The analysis has been suspended because no working URL could be found. Please check the URL/domain.";
            output.add_error(error_message);
            status.add_critical_to_summary("analysis-manager-error", error_message);
            return;
        }

        // Sort analyzers by order
        self.analyzers.sort_by_key(|a| a.get_order());

        for analyzer in &mut self.analyzers {
            analyzer.analyze(status, output);
        }

        // Collect and merge exec times from all analyzers
        if !self.analyzers.is_empty() {
            let mut all_exec_times: HashMap<String, f64> = HashMap::new();
            let mut all_exec_counts: HashMap<String, usize> = HashMap::new();
            for analyzer in &self.analyzers {
                for (key, time) in analyzer.get_exec_times() {
                    *all_exec_times.entry(key.clone()).or_insert(0.0) += time;
                }
                for (key, count) in analyzer.get_exec_counts() {
                    *all_exec_counts.entry(key.clone()).or_insert(0) += count;
                }
            }

            let mut super_table = self.stats.get_super_table(
                SUPER_TABLE_ANALYSIS_STATS,
                "Analysis stats",
                "No analysis stats",
                Some(&all_exec_times),
                Some(&all_exec_counts),
            );
            status.configure_super_table_url_stripping(&mut super_table);
            output.add_super_table(&super_table);
            status.add_super_table_at_end(super_table);
        }
    }

    /// Get all analyzers
    pub fn get_analyzers(&self) -> &[Box<dyn Analyzer>] {
        &self.analyzers
    }

    /// Check if analyzer with given name is active
    pub fn has_analyzer(&self, name: &str) -> bool {
        self.analyzers.iter().any(|a| a.get_name() == name)
    }

    /// Get extra columns from all analyzers that want to show results as columns.
    /// Returns columns in registration order (alphabetical).
    // NOTE(review): the Vec element type (return type of
    // show_analyzed_visited_url_result_as_column) was lost in extraction — restore it.
    pub fn get_extra_columns(&self) -> Vec {
        self.analyzers
            .iter()
            .filter_map(|a| a.show_analyzed_visited_url_result_as_column())
            .collect()
    }

    /// Map analysis results to extra column values for the progress table.
    /// Returns a HashMap of column_name -> colorized_value_string.
    pub fn get_analysis_column_values(
        &self,
        analysis_results: &[(String, UrlAnalysisResult)],
    ) -> HashMap<String, String> {
        let mut result = HashMap::new();
        for analyzer in &self.analyzers {
            if let Some(extra_col) = analyzer.show_analyzed_visited_url_result_as_column() {
                let analyzer_name = analyzer.get_name();
                // Find the matching result for this analyzer
                if let Some((_, url_result)) =
                    analysis_results.iter().find(|(name, _)| name == analyzer_name)
                {
                    let colorized = url_result.to_colorized_string(true);
                    if !colorized.is_empty() {
                        result.insert(extra_col.name.clone(), colorized);
                    }
                }
            }
        }
        result
    }
}

impl Default for AnalysisManager {
    fn default() -> Self {
        Self::new()
    }
}


================================================
FILE: src/analysis/mod.rs
================================================
pub mod analyzer;
pub mod base_analyzer;
pub mod manager;
pub mod result;

// Simple analyzers
pub mod caching_analyzer;
pub mod content_type_analyzer;
pub mod dns_analyzer;
pub mod external_links_analyzer;
pub mod fastest_analyzer;
pub mod headers_analyzer;
pub mod page404_analyzer;
pub mod redirects_analyzer;
pub mod skipped_urls_analyzer;
pub mod slowest_analyzer;
pub mod source_domains_analyzer;

// Complex analyzers (DOM parsing / TLS inspection)
pub mod accessibility_analyzer;
pub mod best_practice_analyzer;
pub mod security_analyzer;
pub mod seo_opengraph_analyzer;
pub mod
ssl_tls_analyzer; ================================================ FILE: src/analysis/page404_analyzer.rs ================================================ // SiteOne Crawler - Page404Analyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::utils; const SUPER_TABLE_404: &str = "404"; pub struct Page404Analyzer { base: BaseAnalyzer, } impl Default for Page404Analyzer { fn default() -> Self { Self::new() } } impl Page404Analyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } } impl Analyzer for Page404Analyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let visited_urls = status.get_visited_urls(); let urls_404: Vec<_> = visited_urls.iter().filter(|u| u.status_code == 404).cloned().collect(); let console_width = utils::get_console_width(); let url_column_size = ((console_width as i32 - 16) / 2).max(20); let status_ref = status; let columns = vec![ SuperTableColumn::new( "statusCode".to_string(), "Status".to_string(), 6, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_status_code(v, 6) } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "url".to_string(), "URL 404".to_string(), url_column_size, None, None, true, true, false, true, None, ), SuperTableColumn::new( "sourceUqId".to_string(), "Found at URL".to_string(), url_column_size, None, None, true, true, false, true, None, ), ]; let data: Vec> = urls_404 .iter() .map(|u| { let mut row = HashMap::new(); row.insert("statusCode".to_string(), u.status_code.to_string()); row.insert("url".to_string(), u.url.clone()); let source_url = if !u.source_uq_id.is_empty() { 
status_ref.get_url_by_uq_id(&u.source_uq_id).unwrap_or_default() } else { String::new() }; row.insert("sourceUqId".to_string(), source_url); row }) .collect(); let count_404 = data.len(); let mut super_table = SuperTable::new( SUPER_TABLE_404.to_string(), "404 URLs".to_string(), "No 404 URLs found.".to_string(), columns, true, Some("url".to_string()), "ASC".to_string(), None, None, None, ); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); status.add_summary_item_by_ranges( "404", count_404 as f64, &[(0.0, 0.0), (1.0, 2.0), (3.0, 5.0), (6.0, f64::MAX)], &[ "404 OK - all pages exists, no non-existent pages found", "404 NOTICE - {} non-existent page(s) found", "404 WARNING - {} non-existent pages found", "404 CRITICAL - {} non-existent pages found", ], ); } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 20 } fn get_name(&self) -> &str { "Page404Analyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } ================================================ FILE: src/analysis/redirects_analyzer.rs ================================================ // SiteOne Crawler - RedirectsAnalyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::utils; const SUPER_TABLE_REDIRECTS: &str = "redirects"; pub struct RedirectsAnalyzer { base: BaseAnalyzer, } impl Default for RedirectsAnalyzer { fn default() -> Self { Self::new() } } impl RedirectsAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } } impl Analyzer for RedirectsAnalyzer { fn analyze(&mut 
self, status: &Status, output: &mut dyn Output) {
        let visited_urls = status.get_visited_urls();
        // All permanent/temporary redirect status codes
        let url_redirects: Vec<_> = visited_urls
            .iter()
            .filter(|u| (301..=308).contains(&u.status_code))
            .cloned()
            .collect();

        let console_width = utils::get_console_width();
        let url_column_width = ((console_width as i32 - 20) / 3).max(20);

        let columns = vec![
            SuperTableColumn::new(
                "statusCode".to_string(),
                "Status".to_string(),
                6,
                // NOTE(review): parsed integer width lost in extraction — i32 assumed
                Some(Box::new(|value: &str, _render_into: &str| {
                    if let Ok(v) = value.parse::<i32>() {
                        utils::get_colored_status_code(v, 6)
                    } else {
                        value.to_string()
                    }
                })),
                None,
                false,
                false,
                false,
                true,
                None,
            ),
            SuperTableColumn::new(
                "url".to_string(),
                "Redirected URL".to_string(),
                url_column_width,
                None,
                None,
                true,
                true,
                false,
                true,
                None,
            ),
            SuperTableColumn::new(
                "targetUrl".to_string(),
                "Target URL".to_string(),
                url_column_width,
                None,
                None,
                true,
                true,
                false,
                true,
                None,
            ),
            SuperTableColumn::new(
                "sourceUqId".to_string(),
                "Found at URL".to_string(),
                url_column_width,
                None,
                None,
                true,
                true,
                false,
                true,
                None,
            ),
        ];

        let data: Vec<HashMap<String, String>> = url_redirects
            .iter()
            .map(|u| {
                let mut row = HashMap::new();
                row.insert("statusCode".to_string(), u.status_code.to_string());
                row.insert("url".to_string(), u.url.clone());

                // Target URL from the Location header in extras
                let target = u
                    .extras
                    .as_ref()
                    .and_then(|e| e.get("Location"))
                    .cloned()
                    .unwrap_or_else(|| "?".to_string());
                row.insert("targetUrl".to_string(), target);

                let source_url = if !u.source_uq_id.is_empty() {
                    status.get_url_by_uq_id(&u.source_uq_id).unwrap_or_default()
                } else {
                    String::new()
                };
                row.insert("sourceUqId".to_string(), source_url);
                row
            })
            .collect();

        let count_redirects = data.len();

        let mut super_table = SuperTable::new(
            SUPER_TABLE_REDIRECTS.to_string(),
            "Redirected URLs".to_string(),
            "No redirects found.".to_string(),
            columns,
            true,
            Some("url".to_string()),
            "ASC".to_string(),
            None,
            None,
            None,
        );
        super_table.set_data(data);
        status.configure_super_table_url_stripping(&mut super_table);
        output.add_super_table(&super_table);
        status.add_super_table_at_beginning(super_table);

        status.add_summary_item_by_ranges(
            "redirects",
            count_redirects as f64,
            &[(0.0, 0.0), (1.0, 2.0), (3.0, 9.0), (10.0, f64::MAX)],
            &[
                "Redirects - no redirects found",
                "Redirects - {} redirect(s) found",
                "Redirects - {} redirects found",
                "Redirects - {} redirects found",
            ],
        );
    }

    fn should_be_activated(&self) -> bool {
        true
    }

    fn get_order(&self) -> i32 {
        10
    }

    fn get_name(&self) -> &str {
        "RedirectsAnalyzer"
    }

    fn get_exec_times(&self) -> &HashMap<String, f64> {
        self.base.get_exec_times()
    }

    fn get_exec_counts(&self) -> &HashMap<String, usize> {
        self.base.get_exec_counts()
    }
}


================================================
FILE: src/analysis/result/analyzer_stats.rs
================================================
// SiteOne Crawler - AnalyzerStats
// (c) Jan Reges

use std::collections::HashMap;

/// Aggregated OK/notice/warning/critical counters per analysis, deduplicated
/// by a hash of the reported subject.
#[derive(Debug, Clone, Default)]
pub struct AnalyzerStats {
    /// analysis_name -> severity -> set of subject hashes (or just counted entries)
    severity_counts_per_analysis: HashMap<String, SeverityCounts>,
}

// HashMap used as a set: key = subject hash, value is always true
#[derive(Debug, Clone, Default)]
struct SeverityCounts {
    ok: HashMap<String, bool>,
    notice: HashMap<String, bool>,
    warning: HashMap<String, bool>,
    critical: HashMap<String, bool>,
}

impl AnalyzerStats {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn add_ok(&mut self, analysis_name: &str, subject: Option<&str>) {
        self.add_result(analysis_name, "ok", subject);
    }

    pub fn add_warning(&mut self, analysis_name: &str, subject: Option<&str>) {
        self.add_result(analysis_name, "warning", subject);
    }

    pub fn add_critical(&mut self, analysis_name: &str, subject: Option<&str>) {
        self.add_result(analysis_name, "critical", subject);
    }

    pub fn add_notice(&mut self, analysis_name: &str, subject: Option<&str>) {
        self.add_result(analysis_name, "notice", subject);
    }

    /// Render the per-analysis severity counts as super-table rows.
    pub fn to_table_data(&self) -> Vec<HashMap<String, String>> {
        let mut result = Vec::new();
        for (analysis_name, counts) in &self.severity_counts_per_analysis {
            let mut row = HashMap::new();
            row.insert("analysisName".to_string(), analysis_name.clone());
            row.insert("ok".to_string(),
counts.ok.len().to_string()); row.insert("notice".to_string(), counts.notice.len().to_string()); row.insert("warning".to_string(), counts.warning.len().to_string()); row.insert("critical".to_string(), counts.critical.len().to_string()); result.push(row); } result } fn add_result(&mut self, analysis_name: &str, severity: &str, subject: Option<&str>) { let counts = self .severity_counts_per_analysis .entry(analysis_name.to_string()) .or_default(); let subject_hash = subject.map(|s| { use md5::{Digest, Md5}; let mut hasher = Md5::new(); hasher.update(s.trim().as_bytes()); let result = hasher.finalize(); format!("{:x}", result)[..10].to_string() }); let map = match severity { "ok" => &mut counts.ok, "notice" => &mut counts.notice, "warning" => &mut counts.warning, "critical" => &mut counts.critical, _ => return, }; if let Some(hash) = subject_hash { map.insert(hash, true); } else { // Use a unique key based on current count let key = format!("_auto_{}", map.len()); map.insert(key, true); } } } ================================================ FILE: src/analysis/result/dns_analysis_result.rs ================================================ // SiteOne Crawler - DnsAnalysisResult // (c) Jan Reges #[derive(Debug, Clone)] pub struct DnsAnalysisResult { pub dns_server_name: String, pub dns_server_ip_address: String, /// DNS resolved domain names (aliases) with all CNAMEs. /// First is the original domain name and last is the final resolved domain name. 
pub resolved_domains: Vec, /// Final resolved IPv4 addresses pub ipv4_addresses: Vec, /// Final resolved IPv6 addresses (when available) pub ipv6_addresses: Vec, } impl DnsAnalysisResult { pub fn new( dns_server_name: String, dns_server_ip_address: String, resolved_domains: Vec, ipv4_addresses: Vec, ipv6_addresses: Vec, ) -> Self { Self { dns_server_name, dns_server_ip_address, resolved_domains, ipv4_addresses, ipv6_addresses, } } /// Get text description of DNS analysis result in format respecting the /// hierarchy of resolved domains/CNAMEs and IPs. pub fn get_txt_description(&self) -> String { let mut result = String::new(); for (i, domain) in self.resolved_domains.iter().enumerate() { result.push_str(&" ".repeat(i)); result.push_str(domain); result.push('\n'); } let indent = " ".repeat(self.resolved_domains.len()); for ip in &self.ipv4_addresses { result.push_str(&indent); result.push_str(&format!("IPv4: {}\n", ip)); } for ip in &self.ipv6_addresses { result.push_str(&indent); result.push_str(&format!("IPv6: {}\n", ip)); } // Add DNS server info if available (0.0.0.0 means unknown, typical for CYGWIN) if self.dns_server_ip_address != "0.0.0.0" { if self.dns_server_name != self.dns_server_ip_address { result.push_str(&format!( "\nDNS server: {} ({})\n", self.dns_server_name, self.dns_server_ip_address )); } else { result.push_str(&format!("\nDNS server: {}\n", self.dns_server_name)); } } result.trim().to_string() } } ================================================ FILE: src/analysis/result/header_stats.rs ================================================ // SiteOne Crawler - HeaderStats // (c) Jan Reges use std::collections::HashMap; use crate::utils; const MAX_UNIQUE_VALUES: usize = 20; #[derive(Debug, Clone)] pub struct HeaderStats { pub header: String, pub occurrences: usize, pub unique_values: HashMap, pub unique_values_limit_reached: bool, pub min_date_value: Option, pub max_date_value: Option, pub min_int_value: Option, pub max_int_value: Option, } impl 
HeaderStats {
    pub fn new(header: String) -> Self {
        Self {
            header,
            occurrences: 0,
            unique_values: HashMap::new(),
            unique_values_limit_reached: false,
            min_date_value: None,
            max_date_value: None,
            min_int_value: None,
            max_int_value: None,
        }
    }

    /// Record one observed header value, routing it to the right aggregation:
    /// ignored entirely, min/max date, min/max int, or distinct-value counting.
    pub fn add_value(&mut self, value: &str) {
        self.occurrences += 1;
        // One clone of the header name to satisfy the borrow checker while the
        // classification helpers borrow &self and the branches mutate self.
        let header = self.header.clone();
        if self.ignore_header_values(&header) {
            // High-cardinality / sensitive headers: count occurrences only.
        } else if self.is_value_for_min_max_date(&header) {
            self.add_value_for_min_max_date(value);
        } else if self.is_value_for_min_max_int(&header) {
            self.add_value_for_min_max_int(value);
        } else if self.unique_values.len() >= MAX_UNIQUE_VALUES {
            // Too many distinct values; remember that the list is truncated.
            self.unique_values_limit_reached = true;
        } else {
            *self.unique_values.entry(value.to_string()).or_insert(0) += 1;
        }
    }

    /// Distinct values sorted by descending occurrence count.
    pub fn get_sorted_unique_values(&self) -> Vec<(&String, &usize)> {
        let mut sorted: Vec<_> = self.unique_values.iter().collect();
        sorted.sort_by(|a, b| b.1.cmp(a.1));
        sorted
    }

    /// "content-security-policy" -> "Content-Security-Policy"; "Xss" is
    /// special-cased so "x-xss-protection" renders as "X-XSS-Protection".
    pub fn get_formatted_header_name(&self) -> String {
        let words: Vec<String> = self
            .header
            .split('-')
            .map(|w| {
                let mut chars = w.chars();
                match chars.next() {
                    Some(c) => format!("{}{}", c.to_uppercase(), chars.as_str()),
                    None => String::new(),
                }
            })
            .collect();
        words.join("-").replace("Xss", "XSS")
    }

    /// Headers whose numeric values are tracked as a min/max range.
    pub fn is_value_for_min_max_int(&self, header: &str) -> bool {
        header == "content-length" || header == "age"
    }

    /// Headers whose HTTP-date values are tracked as a min/max range.
    pub fn is_value_for_min_max_date(&self, header: &str) -> bool {
        header == "date" || header == "expires" || header == "last-modified"
    }

    /// Headers whose values are effectively unique per response and therefore
    /// pointless to enumerate.
    pub fn ignore_header_values(&self, header: &str) -> bool {
        matches!(header, "etag" | "cf-ray" | "set-cookie" | "content-disposition")
    }

    /// Minimum observed value as a string — int range wins over date range.
    pub fn get_min_value(&self) -> Option<String> {
        self.min_int_value
            .map(|v| v.to_string())
            .or_else(|| self.min_date_value.clone())
    }

    /// Maximum observed value as a string — int range wins over date range.
    pub fn get_max_value(&self) -> Option<String> {
        self.max_int_value
            .map(|v| v.to_string())
            .or_else(|| self.max_date_value.clone())
    }

    /// Short human-readable preview of the distinct values, truncated with an
    /// ellipsis to fit `max_length` display characters.
    pub fn get_values_preview(&self, max_length: usize) -> String {
        if self.unique_values.len() == 1
            && let Some(first_value) =
self.unique_values.keys().next()
        {
            if first_value.chars().count() > max_length {
                return utils::truncate_in_two_thirds(first_value, max_length, "\u{2026}", None);
            }
            return first_value.clone();
        }

        // If all values fit (with some slack for the " (count)" suffixes),
        // list them inline sorted by descending frequency.
        let values_length: usize = self.unique_values.keys().map(|k| k.len()).sum();
        if values_length < max_length.saturating_sub(10) {
            let mut sorted: Vec<_> = self.unique_values.iter().collect();
            sorted.sort_by(|a, b| b.1.cmp(a.1));
            let mut result = String::new();
            for (value, count) in sorted {
                result.push_str(&format!("{} ({}) / ", value, count));
            }
            let trimmed = result.trim().trim_end_matches(" /").to_string();
            if trimmed.is_empty() {
                return "[ignored generic values]".to_string();
            }
            return utils::truncate_in_two_thirds(&trimmed, max_length, "\u{2026}", None);
        }

        "[see values below]".to_string()
    }

    /// Track min/max for integer-valued headers; unparseable values are ignored.
    fn add_value_for_min_max_int(&mut self, value: &str) {
        if let Ok(int_val) = value.parse::<i64>() {
            match self.min_int_value {
                None => self.min_int_value = Some(int_val),
                Some(min) if int_val < min => self.min_int_value = Some(int_val),
                _ => {}
            }
            match self.max_int_value {
                None => self.max_int_value = Some(int_val),
                Some(max) if int_val > max => self.max_int_value = Some(int_val),
                _ => {}
            }
        }
    }

    fn add_value_for_min_max_date(&mut self, value: &str) {
        // Try to parse HTTP date format into a simple YYYY-MM-DD string;
        // lexicographic comparison of that format matches chronological order.
        if let Ok(dt) = chrono::DateTime::parse_from_rfc2822(value) {
            let date = dt.format("%Y-%m-%d").to_string();
            match &self.min_date_value {
                None => self.min_date_value = Some(date.clone()),
                Some(min) if &date < min => self.min_date_value = Some(date.clone()),
                _ => {}
            }
            match &self.max_date_value {
                None => self.max_date_value = Some(date),
                Some(max) if &date > max => self.max_date_value = Some(date),
                _ => {}
            }
        }
    }
}


================================================
FILE: src/analysis/result/heading_tree_item.rs
================================================
// SiteOne Crawler - HeadingTreeItem
// (c) Jan Reges

/// Minimal HTML entity escaping for text interpolated into HTML output.
/// NOTE(review): the entity strings were decoded away by extraction and have
/// been reconstructed (&amp; &lt; &gt; &quot; &#39;) — confirm the apostrophe
/// entity (&#39; vs &apos;) against the original source.
fn html_escape(s: &str) -> String {
    s.replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
        .replace('"', "&quot;")
        .replace('\'', "&#39;")
}

#[derive(Debug, Clone)]
pub struct HeadingTreeItem {
    /// Heading level (1-6)
    pub level: i32,
    /// Real heading level by heading structure in HTML
    pub real_level: Option<i32>,
    /// Heading text
    pub text: String,
    /// Heading ID attribute
    pub id: Option<String>,
    /// Children headings
    pub children: Vec<HeadingTreeItem>,
    /// Error text in case of error (typically multiple H1s or wrong heading level)
    pub error_text: Option<String>,
}

impl HeadingTreeItem {
    pub fn new(level: i32, text: String, id: Option<String>) -> Self {
        Self {
            level,
            real_level: None,
            text,
            id,
            children: Vec::new(),
            error_text: None,
        }
    }

    pub fn has_error(&self) -> bool {
        self.error_text.is_some()
    }

    /// Get heading tree as a plain text list
    pub fn get_heading_tree_txt_list(items: &[HeadingTreeItem]) -> String {
        let mut result = String::new();
        for item in items {
            result.push_str(&Self::get_heading_tree_txt(item, true));
        }
        // Collapse whitespace (the fallback ".^" regex never matches, so a
        // regex-compilation failure degrades to a no-op replacement).
        let re = regex::Regex::new(r"\s+").unwrap_or_else(|_| regex::Regex::new(".^").unwrap());
        re.replace_all(&result, " ").trim().to_string()
    }

    fn get_heading_tree_txt(item: &HeadingTreeItem, add_item: bool) -> String {
        let mut result = String::new();
        if add_item {
            // NOTE(review): the "<h{}>" prefix was stripped by extraction and
            // reconstructed here ("<h{}> {}") — confirm exact format string.
            result.push_str(&format!("<h{}> {}", item.level, item.text));
            if let Some(ref id) = item.id {
                result.push_str(&format!(" [#{}]", id));
            }
            result.push('\n');
        }
        for child in &item.children {
            // Indent children by their (level - 1) for a tree-like listing.
            result.push_str(&" ".repeat((child.level - 1) as usize));
            result.push_str(&format!("<h{}> {}", child.level, child.text));
            if let Some(ref id) = child.id {
                result.push_str(&format!(" [#{}]", id));
            }
            result.push('\n');
            result.push_str(&Self::get_heading_tree_txt(child, false));
        }
        result
    }

    /// Get heading tree as an HTML `