Repository: janreges/siteone-crawler Branch: main Commit: 63203298f00e Files: 132 Total size: 2.0 MB Directory structure: gitextract_j_gpbi69/ ├── .githooks/ │ └── pre-commit ├── .github/ │ └── workflows/ │ ├── ci.yml │ ├── publish.yml │ └── release.yml ├── .gitignore ├── CHANGELOG.md ├── CLAUDE.md ├── Cargo.toml ├── LICENSE ├── README.md ├── docs/ │ ├── JSON-OUTPUT.md │ ├── OUTPUT-crawler.siteone.io.json │ ├── OUTPUT-crawler.siteone.io.txt │ └── TEXT-OUTPUT.md ├── rustfmt.toml ├── src/ │ ├── analysis/ │ │ ├── accessibility_analyzer.rs │ │ ├── analyzer.rs │ │ ├── base_analyzer.rs │ │ ├── best_practice_analyzer.rs │ │ ├── caching_analyzer.rs │ │ ├── content_type_analyzer.rs │ │ ├── dns_analyzer.rs │ │ ├── external_links_analyzer.rs │ │ ├── fastest_analyzer.rs │ │ ├── headers_analyzer.rs │ │ ├── manager.rs │ │ ├── mod.rs │ │ ├── page404_analyzer.rs │ │ ├── redirects_analyzer.rs │ │ ├── result/ │ │ │ ├── analyzer_stats.rs │ │ │ ├── dns_analysis_result.rs │ │ │ ├── header_stats.rs │ │ │ ├── heading_tree_item.rs │ │ │ ├── mod.rs │ │ │ ├── security_checked_header.rs │ │ │ ├── security_result.rs │ │ │ ├── seo_opengraph_result.rs │ │ │ └── url_analysis_result.rs │ │ ├── security_analyzer.rs │ │ ├── seo_opengraph_analyzer.rs │ │ ├── skipped_urls_analyzer.rs │ │ ├── slowest_analyzer.rs │ │ ├── source_domains_analyzer.rs │ │ └── ssl_tls_analyzer.rs │ ├── components/ │ │ ├── mod.rs │ │ ├── summary/ │ │ │ ├── item.rs │ │ │ ├── item_status.rs │ │ │ ├── mod.rs │ │ │ └── summary.rs │ │ ├── super_table.rs │ │ └── super_table_column.rs │ ├── content_processor/ │ │ ├── astro_processor.rs │ │ ├── base_processor.rs │ │ ├── content_processor.rs │ │ ├── css_processor.rs │ │ ├── html_processor.rs │ │ ├── javascript_processor.rs │ │ ├── manager.rs │ │ ├── mod.rs │ │ ├── nextjs_processor.rs │ │ ├── svelte_processor.rs │ │ └── xml_processor.rs │ ├── debugger.rs │ ├── engine/ │ │ ├── crawler.rs │ │ ├── found_url.rs │ │ ├── found_urls.rs │ │ ├── http_client.rs │ │ ├── http_response.rs │ │ ├── 
initiator.rs │ │ ├── manager.rs │ │ ├── mod.rs │ │ ├── parsed_url.rs │ │ └── robots_txt.rs │ ├── error.rs │ ├── export/ │ │ ├── base_exporter.rs │ │ ├── exporter.rs │ │ ├── file_exporter.rs │ │ ├── html_report/ │ │ │ ├── badge.rs │ │ │ ├── mod.rs │ │ │ ├── report.rs │ │ │ ├── tab.rs │ │ │ └── template.html │ │ ├── mailer_exporter.rs │ │ ├── markdown_exporter.rs │ │ ├── mod.rs │ │ ├── offline_website_exporter.rs │ │ ├── sitemap_exporter.rs │ │ ├── upload_exporter.rs │ │ └── utils/ │ │ ├── html_to_markdown.rs │ │ ├── markdown_site_aggregator.rs │ │ ├── mod.rs │ │ ├── offline_url_converter.rs │ │ └── target_domain_relation.rs │ ├── extra_column.rs │ ├── info.rs │ ├── lib.rs │ ├── main.rs │ ├── options/ │ │ ├── core_options.rs │ │ ├── group.rs │ │ ├── mod.rs │ │ ├── option.rs │ │ ├── option_type.rs │ │ └── options.rs │ ├── output/ │ │ ├── json_output.rs │ │ ├── mod.rs │ │ ├── multi_output.rs │ │ ├── output.rs │ │ ├── output_type.rs │ │ └── text_output.rs │ ├── result/ │ │ ├── basic_stats.rs │ │ ├── manager_stats.rs │ │ ├── mod.rs │ │ ├── status.rs │ │ ├── storage/ │ │ │ ├── file_storage.rs │ │ │ ├── memory_storage.rs │ │ │ ├── mod.rs │ │ │ ├── storage.rs │ │ │ └── storage_type.rs │ │ └── visited_url.rs │ ├── scoring/ │ │ ├── ci_gate.rs │ │ ├── mod.rs │ │ ├── quality_score.rs │ │ └── scorer.rs │ ├── server.rs │ ├── types.rs │ ├── utils.rs │ ├── version.rs │ └── wizard/ │ ├── form.rs │ ├── mod.rs │ └── presets.rs └── tests/ ├── common/ │ └── mod.rs └── integration_crawl.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .githooks/pre-commit ================================================ #!/bin/bash # Pre-commit hook: run cargo fmt, clippy, and tests before committing. 
set -e echo "=== Pre-commit: cargo fmt --check ===" cargo fmt -- --check echo "=== Pre-commit: cargo clippy ===" cargo clippy -- -D warnings echo "=== Pre-commit: cargo test ===" cargo test echo "=== Pre-commit checks passed ===" ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: push: branches: [main] pull_request: branches: [main] env: CARGO_TERM_COLOR: always jobs: check: name: Check & Lint runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: dtolnay/rust-toolchain@stable with: components: rustfmt, clippy - uses: actions/cache@v5 with: path: | ~/.cargo/registry ~/.cargo/git target key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} restore-keys: | ${{ runner.os }}-cargo- - name: Check formatting run: cargo fmt -- --check - name: Clippy run: cargo clippy -- -D warnings - name: Build run: cargo build - name: Run tests run: cargo test ================================================ FILE: .github/workflows/publish.yml ================================================ name: Publish to package managers # Triggers when a draft release is published (manually via GitHub UI) on: release: types: [published] workflow_dispatch: inputs: tag: description: 'Release tag (e.g. 
v2.0.1)' required: true permissions: contents: write jobs: # ───────────────────────────────────────────────────────────────── # Publish to crates.io # ───────────────────────────────────────────────────────────────── publish-crates: name: Publish to crates.io runs-on: ubuntu-latest if: vars.PUBLISH_CRATES == 'true' steps: - name: Checkout uses: actions/checkout@v6 with: ref: ${{ github.event.release.tag_name || inputs.tag }} - name: Determine version id: version run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Ensure Cargo.toml has correct version env: VERSION: ${{ steps.version.outputs.version }} run: sed -i "s/^version = .*/version = \"${VERSION}\"/" Cargo.toml - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable - name: Publish env: CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} run: cargo publish --no-verify --allow-dirty || echo "Already published (skipping)" # ───────────────────────────────────────────────────────────────── # Update Homebrew tap # ───────────────────────────────────────────────────────────────── publish-homebrew: name: Update Homebrew formula runs-on: ubuntu-latest if: vars.PUBLISH_HOMEBREW == 'true' steps: - name: Determine version id: version run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Download release archives and compute SHA256 env: VERSION: ${{ steps.version.outputs.version }} run: | BASE_URL="https://github.com/${{ github.repository }}/releases/download/v${VERSION}" for SUFFIX in linux-x64 linux-arm64 macos-x64 macos-arm64; do FILE="siteone-crawler-v${VERSION}-${SUFFIX}.tar.gz" curl -sfL "${BASE_URL}/${FILE}" -o "${FILE}" SHA=$(sha256sum "${FILE}" | cut -d' ' -f1) VAR_NAME="SHA_$(echo "${SUFFIX}" | tr '[:lower:]-' '[:upper:]_')" echo "${VAR_NAME}=${SHA}" >> "$GITHUB_ENV" echo "${VAR_NAME}=${SHA}" done - name: Clone Homebrew tap env: TAP_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN 
}} run: | git clone "https://x-access-token:${TAP_TOKEN}@github.com/janreges/homebrew-tap.git" tap - name: Update formula env: VERSION: ${{ steps.version.outputs.version }} run: | cat > tap/Formula/siteone-crawler.rb <<'FORMULA' class SiteoneCrawler < Formula desc "Website crawler and QA toolkit in Rust for security, performance, SEO, and accessibility audits, offline cloning, markdown export, sitemap generation, cache warming, and CI/CD gating — one dependency-free binary for all major platforms, 10 tools in one." homepage "https://crawler.siteone.io/" version "VERSION_PLACEHOLDER" license "MIT" on_macos do if Hardware::CPU.arm? url "https://github.com/janreges/siteone-crawler/releases/download/v#{version}/siteone-crawler-v#{version}-macos-arm64.tar.gz" sha256 "SHA_MACOS_ARM64_PLACEHOLDER" else url "https://github.com/janreges/siteone-crawler/releases/download/v#{version}/siteone-crawler-v#{version}-macos-x64.tar.gz" sha256 "SHA_MACOS_X64_PLACEHOLDER" end end on_linux do if Hardware::CPU.arm? 
url "https://github.com/janreges/siteone-crawler/releases/download/v#{version}/siteone-crawler-v#{version}-linux-arm64.tar.gz" sha256 "SHA_LINUX_ARM64_PLACEHOLDER" else url "https://github.com/janreges/siteone-crawler/releases/download/v#{version}/siteone-crawler-v#{version}-linux-x64.tar.gz" sha256 "SHA_LINUX_X64_PLACEHOLDER" end end def install bin.install "siteone-crawler" end test do assert_match "SiteOne Crawler", shell_output("#{bin}/siteone-crawler --version") end end FORMULA sed -i "s/VERSION_PLACEHOLDER/${VERSION}/g" tap/Formula/siteone-crawler.rb sed -i "s/SHA_MACOS_ARM64_PLACEHOLDER/${SHA_MACOS_ARM64}/g" tap/Formula/siteone-crawler.rb sed -i "s/SHA_MACOS_X64_PLACEHOLDER/${SHA_MACOS_X64}/g" tap/Formula/siteone-crawler.rb sed -i "s/SHA_LINUX_ARM64_PLACEHOLDER/${SHA_LINUX_ARM64}/g" tap/Formula/siteone-crawler.rb sed -i "s/SHA_LINUX_X64_PLACEHOLDER/${SHA_LINUX_X64}/g" tap/Formula/siteone-crawler.rb - name: Push updated formula run: | cd tap git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git add Formula/siteone-crawler.rb git diff --cached --quiet && echo "Formula already up to date" && exit 0 git commit -m "chore: update siteone-crawler to v${{ steps.version.outputs.version }}" git push # ───────────────────────────────────────────────────────────────── # Update Scoop bucket # ───────────────────────────────────────────────────────────────── publish-scoop: name: Update Scoop manifest runs-on: ubuntu-latest if: vars.PUBLISH_SCOOP == 'true' steps: - name: Determine version id: version run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Download Windows archives and compute SHA256 env: VERSION: ${{ steps.version.outputs.version }} run: | BASE_URL="https://github.com/${{ github.repository }}/releases/download/v${VERSION}" for SUFFIX in win-x64 win-arm64; do FILE="siteone-crawler-v${VERSION}-${SUFFIX}.zip" curl -sfL 
"${BASE_URL}/${FILE}" -o "${FILE}" SHA=$(sha256sum "${FILE}" | cut -d' ' -f1) VAR_NAME="SHA_$(echo "${SUFFIX}" | tr '[:lower:]-' '[:upper:]_')" echo "${VAR_NAME}=${SHA}" >> "$GITHUB_ENV" done - name: Clone Scoop bucket env: BUCKET_TOKEN: ${{ secrets.SCOOP_BUCKET_TOKEN }} run: | git clone "https://x-access-token:${BUCKET_TOKEN}@github.com/janreges/scoop-siteone.git" bucket - name: Update manifest env: VERSION: ${{ steps.version.outputs.version }} run: | mkdir -p bucket/bucket cat > bucket/bucket/siteone-crawler.json << 'TEMPLATE' { "version": "VERSION_PLACEHOLDER", "description": "Website crawler and QA toolkit in Rust for security, performance, SEO, and accessibility audits, offline cloning, markdown export, sitemap generation, cache warming, and CI/CD gating — one dependency-free binary for all major platforms, 10 tools in one.", "homepage": "https://crawler.siteone.io/", "license": "MIT", "architecture": { "64bit": { "url": "https://github.com/janreges/siteone-crawler/releases/download/vVERSION_PLACEHOLDER/siteone-crawler-vVERSION_PLACEHOLDER-win-x64.zip", "hash": "HASH_X64_PLACEHOLDER" }, "arm64": { "url": "https://github.com/janreges/siteone-crawler/releases/download/vVERSION_PLACEHOLDER/siteone-crawler-vVERSION_PLACEHOLDER-win-arm64.zip", "hash": "HASH_ARM64_PLACEHOLDER" } }, "extract_dir": "siteone-crawler", "bin": "siteone-crawler.exe", "checkver": "github", "autoupdate": { "architecture": { "64bit": { "url": "https://github.com/janreges/siteone-crawler/releases/download/v$version/siteone-crawler-v$version-win-x64.zip" }, "arm64": { "url": "https://github.com/janreges/siteone-crawler/releases/download/v$version/siteone-crawler-v$version-win-arm64.zip" } } } } TEMPLATE sed -i "s/VERSION_PLACEHOLDER/${VERSION}/g" bucket/bucket/siteone-crawler.json sed -i "s/HASH_X64_PLACEHOLDER/${SHA_WIN_X64}/g" bucket/bucket/siteone-crawler.json sed -i "s/HASH_ARM64_PLACEHOLDER/${SHA_WIN_ARM64}/g" bucket/bucket/siteone-crawler.json - name: Push updated manifest run: | cd 
bucket git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git add bucket/siteone-crawler.json git commit -m "chore: update siteone-crawler to v${{ steps.version.outputs.version }}" git push # ───────────────────────────────────────────────────────────────── # Submit to WinGet # ───────────────────────────────────────────────────────────────── publish-winget: name: Submit to WinGet runs-on: windows-latest # Requires initial manual submission to microsoft/winget-pkgs first. # Once JanReges.SiteOneCrawler exists in winget-pkgs, set PUBLISH_WINGET=true. if: vars.PUBLISH_WINGET == 'true' steps: - name: Determine version id: version shell: bash run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Install wingetcreate run: winget install Microsoft.WingetCreate --accept-source-agreements --accept-package-agreements - name: Update WinGet manifest env: VERSION: ${{ steps.version.outputs.version }} WINGET_TOKEN: ${{ secrets.WINGET_TOKEN }} run: | $url_x64 = "https://github.com/janreges/siteone-crawler/releases/download/v$env:VERSION/siteone-crawler-v$env:VERSION-win-x64.zip" $url_arm64 = "https://github.com/janreges/siteone-crawler/releases/download/v$env:VERSION/siteone-crawler-v$env:VERSION-win-arm64.zip" wingetcreate update JanReges.SiteOneCrawler ` --version $env:VERSION ` --urls $url_x64 $url_arm64 ` --token $env:WINGET_TOKEN ` --submit # ───────────────────────────────────────────────────────────────── # Update AUR package # ───────────────────────────────────────────────────────────────── publish-aur: name: Update AUR package runs-on: ubuntu-latest if: vars.PUBLISH_AUR == 'true' steps: - name: Determine version id: version run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Compute SHA256 for Linux archives env: VERSION: ${{ steps.version.outputs.version }} run: | 
BASE_URL="https://github.com/${{ github.repository }}/releases/download/v${VERSION}" for SUFFIX in linux-x64 linux-arm64; do FILE="siteone-crawler-v${VERSION}-${SUFFIX}.tar.gz" curl -sfL "${BASE_URL}/${FILE}" -o "${FILE}" SHA=$(sha256sum "${FILE}" | cut -d' ' -f1) VAR_NAME="SHA_$(echo "${SUFFIX}" | tr '[:lower:]-' '[:upper:]_')" echo "${VAR_NAME}=${SHA}" >> "$GITHUB_ENV" done - name: Setup SSH for AUR env: AUR_SSH_KEY: ${{ secrets.AUR_SSH_KEY }} run: | mkdir -p ~/.ssh echo "$AUR_SSH_KEY" > ~/.ssh/aur chmod 600 ~/.ssh/aur echo "Host aur.archlinux.org" >> ~/.ssh/config echo " IdentityFile ~/.ssh/aur" >> ~/.ssh/config echo " User aur" >> ~/.ssh/config ssh-keyscan aur.archlinux.org >> ~/.ssh/known_hosts - name: Clone AUR repo and update PKGBUILD env: VERSION: ${{ steps.version.outputs.version }} run: | git clone ssh://aur@aur.archlinux.org/siteone-crawler-bin.git aur cd aur cat > PKGBUILD << PKGBUILD # Maintainer: Jan Reges pkgname=siteone-crawler-bin pkgver=${VERSION} pkgrel=1 pkgdesc="Website crawler and QA toolkit in Rust for security, performance, SEO, and accessibility audits, offline cloning, markdown export, sitemap generation, cache warming, and CI/CD gating — one dependency-free binary for all major platforms, 10 tools in one." 
arch=('x86_64' 'aarch64') url="https://crawler.siteone.io/" license=('MIT') provides=('siteone-crawler') conflicts=('siteone-crawler') source_x86_64=("https://github.com/janreges/siteone-crawler/releases/download/v\${pkgver}/siteone-crawler-v\${pkgver}-linux-x64.tar.gz") source_aarch64=("https://github.com/janreges/siteone-crawler/releases/download/v\${pkgver}/siteone-crawler-v\${pkgver}-linux-arm64.tar.gz") sha256sums_x86_64=('${SHA_LINUX_X64}') sha256sums_aarch64=('${SHA_LINUX_ARM64}') package() { install -Dm755 "\${srcdir}/siteone-crawler/siteone-crawler" "\${pkgdir}/usr/bin/siteone-crawler" install -Dm644 "\${srcdir}/siteone-crawler/LICENSE" "\${pkgdir}/usr/share/licenses/\${pkgname}/LICENSE" } PKGBUILD cat > .SRCINFO << SRCINFO pkgbase = siteone-crawler-bin pkgdesc = Website crawler and QA toolkit in Rust for security, performance, SEO, and accessibility audits, offline cloning, markdown export, sitemap generation, cache warming, and CI/CD gating — one dependency-free binary for all major platforms, 10 tools in one. 
pkgver = ${VERSION} pkgrel = 1 url = https://crawler.siteone.io/ arch = x86_64 arch = aarch64 license = MIT provides = siteone-crawler conflicts = siteone-crawler source_x86_64 = https://github.com/janreges/siteone-crawler/releases/download/v${VERSION}/siteone-crawler-v${VERSION}-linux-x64.tar.gz sha256sums_x86_64 = ${SHA_LINUX_X64} source_aarch64 = https://github.com/janreges/siteone-crawler/releases/download/v${VERSION}/siteone-crawler-v${VERSION}-linux-arm64.tar.gz sha256sums_aarch64 = ${SHA_LINUX_ARM64} pkgname = siteone-crawler-bin SRCINFO git config user.name "Jan Reges" git config user.email "jan.reges@siteone.cz" git add PKGBUILD .SRCINFO git commit -m "chore: update siteone-crawler to v${VERSION}" git push # ───────────────────────────────────────────────────────────────── # Publish .deb and .rpm to Cloudsmith (APT + DNF repository) # ───────────────────────────────────────────────────────────────── publish-cloudsmith: name: Publish to Cloudsmith runs-on: ubuntu-latest if: vars.PUBLISH_CLOUDSMITH == 'true' steps: - name: Determine version id: version run: | TAG="${{ github.event.release.tag_name || inputs.tag }}" echo "version=${TAG#v}" >> "$GITHUB_OUTPUT" - name: Download .deb, .rpm and .apk from release env: GH_TOKEN: ${{ github.token }} VERSION: ${{ steps.version.outputs.version }} run: | mkdir -p packages BASE_URL="https://github.com/${{ github.repository }}/releases/download/v${VERSION}" # Download all .deb, .rpm and .apk assets from the release for file in $(gh release view "v${VERSION}" --repo "${{ github.repository }}" --json assets -q '.assets[].name' | grep -E '\.(deb|rpm|apk)$'); do echo "Downloading ${file} ..." 
curl -sfL "${BASE_URL}/${file}" -o "packages/${file}" done - name: List packages run: ls -lhR packages/ - name: Install Cloudsmith CLI run: pip install cloudsmith-cli - name: Upload .deb packages env: CLOUDSMITH_API_KEY: ${{ secrets.CLOUDSMITH_API_KEY }} run: | for deb in packages/*.deb; do [ -f "$deb" ] || continue echo "Uploading $deb ..." cloudsmith push deb janreges/siteone-crawler/any-distro/any-version "$deb" --republish done - name: Upload .rpm packages env: CLOUDSMITH_API_KEY: ${{ secrets.CLOUDSMITH_API_KEY }} run: | for rpm in packages/*.rpm; do [ -f "$rpm" ] || continue echo "Uploading $rpm ..." cloudsmith push rpm janreges/siteone-crawler/any-distro/any-version "$rpm" --republish done - name: Upload .apk packages env: CLOUDSMITH_API_KEY: ${{ secrets.CLOUDSMITH_API_KEY }} run: | for apk in packages/*.apk; do [ -f "$apk" ] || continue echo "Uploading $apk ..." cloudsmith push alpine janreges/siteone-crawler/alpine/any-version "$apk" --republish done ================================================ FILE: .github/workflows/release.yml ================================================ name: Release # Trigger: push a tag like v1.0.10 on: push: tags: - 'v*' # Manual trigger for building artifacts only (no release created) workflow_dispatch: inputs: version: description: 'Version number (e.g. 
1.0.10)' required: true permissions: contents: write env: CARGO_TERM_COLOR: always jobs: build: name: Build ${{ matrix.artifact_suffix }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: include: - target: x86_64-unknown-linux-gnu os: ubuntu-latest artifact_suffix: linux-x64 archive: tar.gz - target: aarch64-unknown-linux-gnu os: ubuntu-latest artifact_suffix: linux-arm64 archive: tar.gz cross: true - target: x86_64-apple-darwin os: macos-latest artifact_suffix: macos-x64 archive: tar.gz - target: aarch64-apple-darwin os: macos-latest artifact_suffix: macos-arm64 archive: tar.gz - target: x86_64-pc-windows-msvc os: windows-latest artifact_suffix: win-x64 archive: zip - target: aarch64-pc-windows-msvc os: windows-latest artifact_suffix: win-arm64 archive: zip - target: x86_64-unknown-linux-musl os: ubuntu-latest artifact_suffix: linux-musl-x64 archive: tar.gz musl: true - target: aarch64-unknown-linux-musl os: ubuntu-latest artifact_suffix: linux-musl-arm64 archive: tar.gz cross: true musl: true steps: - name: Checkout uses: actions/checkout@v6 - name: Determine version id: version shell: bash run: | if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then VERSION="${{ github.event.inputs.version }}" else # Extract from tag: v1.0.10 -> 1.0.10 VERSION="${GITHUB_REF_NAME#v}" fi echo "version=${VERSION}" >> "$GITHUB_OUTPUT" echo "Version: ${VERSION}" - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable with: targets: ${{ matrix.target }} - name: Install cross (for cross-compilation) if: matrix.cross run: cargo install cross --git https://github.com/cross-rs/cross - name: Install musl tools if: matrix.musl && !matrix.cross run: sudo apt-get install -y musl-tools - name: Update version in source shell: bash run: | VERSION="${{ steps.version.outputs.version }}" DATE_SUFFIX="$(date +%Y%m%d)" VERSION_CODE="${VERSION}.${DATE_SUFFIX}" # Update Cargo.toml sed -i.bak "s/^version = .*/version = \"${VERSION}\"/" Cargo.toml # Update version.rs sed 
-i.bak "s/^pub const CODE: .*/pub const CODE: \&str = \"${VERSION_CODE}\";/" src/version.rs echo "Cargo.toml version: ${VERSION}" echo "version.rs CODE: ${VERSION_CODE}" - name: Build shell: bash run: | if [[ "${{ matrix.cross }}" == "true" ]]; then cross build --release --target ${{ matrix.target }} else cargo build --release --target ${{ matrix.target }} fi # ── macOS Code Signing & Notarization ────────────────────────── - name: Import Apple certificate if: runner.os == 'macOS' env: CERTIFICATE_BASE64: ${{ secrets.APPLE_CERTIFICATE_BASE64 }} CERTIFICATE_PASSWORD: ${{ secrets.APPLE_CERTIFICATE_PASSWORD }} run: | CERTIFICATE_PATH="$RUNNER_TEMP/certificate.p12" KEYCHAIN_PATH="$RUNNER_TEMP/signing.keychain-db" KEYCHAIN_PASSWORD="$(openssl rand -hex 16)" echo -n "$CERTIFICATE_BASE64" | base64 --decode -o "$CERTIFICATE_PATH" security create-keychain -p "$KEYCHAIN_PASSWORD" "$KEYCHAIN_PATH" security set-keychain-settings -lut 21600 "$KEYCHAIN_PATH" security unlock-keychain -p "$KEYCHAIN_PASSWORD" "$KEYCHAIN_PATH" security import "$CERTIFICATE_PATH" \ -P "$CERTIFICATE_PASSWORD" \ -A -t cert -f pkcs12 \ -k "$KEYCHAIN_PATH" security set-key-partition-list \ -S apple-tool:,apple: \ -k "$KEYCHAIN_PASSWORD" \ "$KEYCHAIN_PATH" security list-keychain -d user -s "$KEYCHAIN_PATH" - name: Sign macOS binary if: runner.os == 'macOS' env: SIGNING_IDENTITY: ${{ secrets.APPLE_SIGNING_IDENTITY }} run: | BINARY="target/${{ matrix.target }}/release/siteone-crawler" codesign --force --options runtime \ --sign "$SIGNING_IDENTITY" \ "$BINARY" echo "Verifying signature..." 
codesign --verify --verbose "$BINARY" echo "Signature OK" - name: Notarize macOS binary if: runner.os == 'macOS' env: APPLE_ID: ${{ secrets.APPLE_ID }} APPLE_ID_PASSWORD: ${{ secrets.APPLE_ID_PASSWORD }} APPLE_TEAM_ID: ${{ secrets.APPLE_TEAM_ID }} run: | BINARY="target/${{ matrix.target }}/release/siteone-crawler" NOTARIZE_ZIP="$RUNNER_TEMP/notarize.zip" # ditto is required — Apple's notary service rejects zip-created archives ditto -c -k --keepParent "$BINARY" "$NOTARIZE_ZIP" echo "Submitting for notarization..." xcrun notarytool submit "$NOTARIZE_ZIP" \ --apple-id "$APPLE_ID" \ --password "$APPLE_ID_PASSWORD" \ --team-id "$APPLE_TEAM_ID" \ --wait echo "Notarization complete" - name: Clean up keychain if: runner.os == 'macOS' && always() run: | KEYCHAIN_PATH="$RUNNER_TEMP/signing.keychain-db" if [ -f "$KEYCHAIN_PATH" ]; then security delete-keychain "$KEYCHAIN_PATH" fi # ──────────────────────────────────────────────────────────────── - name: Package (Unix) if: matrix.archive == 'tar.gz' shell: bash run: | VERSION="${{ steps.version.outputs.version }}" ARTIFACT="siteone-crawler-v${VERSION}-${{ matrix.artifact_suffix }}" mkdir -p "staging/siteone-crawler" cp "target/${{ matrix.target }}/release/siteone-crawler" "staging/siteone-crawler/" cp README.md "staging/siteone-crawler/" 2>/dev/null || true cp LICENSE "staging/siteone-crawler/" 2>/dev/null || true chmod +x "staging/siteone-crawler/siteone-crawler" (cd staging && tar czf "../${ARTIFACT}.tar.gz" siteone-crawler/) echo "ARTIFACT_PATH=${ARTIFACT}.tar.gz" >> "$GITHUB_ENV" - name: Package (Windows) if: matrix.archive == 'zip' shell: bash run: | VERSION="${{ steps.version.outputs.version }}" ARTIFACT="siteone-crawler-v${VERSION}-${{ matrix.artifact_suffix }}" mkdir -p "staging/siteone-crawler" cp "target/${{ matrix.target }}/release/siteone-crawler.exe" "staging/siteone-crawler/" cp README.md "staging/siteone-crawler/" 2>/dev/null || true cp LICENSE "staging/siteone-crawler/" 2>/dev/null || true (cd staging && 7z a 
-r "../${ARTIFACT}.zip" siteone-crawler/) echo "ARTIFACT_PATH=${ARTIFACT}.zip" >> "$GITHUB_ENV" # ── Build .deb and .rpm packages (Linux only) ────────────── - name: Install cross-compilation tools (arm64) if: runner.os == 'Linux' && matrix.cross run: sudo apt-get install -y binutils-aarch64-linux-gnu - name: Strip binary (Linux) if: runner.os == 'Linux' shell: bash run: | BINARY="target/${{ matrix.target }}/release/siteone-crawler" if [[ "${{ matrix.target }}" == "aarch64"* ]]; then aarch64-linux-gnu-strip -s "$BINARY" || true else strip -s "$BINARY" || true fi - name: Build .deb package if: runner.os == 'Linux' shell: bash run: | cargo install cargo-deb if [[ "${{ matrix.musl }}" == "true" ]]; then cargo deb --no-build --no-strip --target ${{ matrix.target }} --variant static else cargo deb --no-build --no-strip --target ${{ matrix.target }} fi echo "DEB_PATH=$(ls target/${{ matrix.target }}/debian/*.deb)" >> "$GITHUB_ENV" - name: Build .rpm package if: runner.os == 'Linux' shell: bash run: | cargo install cargo-generate-rpm mkdir -p target/release cp "target/${{ matrix.target }}/release/siteone-crawler" target/release/ if [[ "${{ matrix.musl }}" == "true" ]]; then # Override package name for static/musl variant sed -i 's/^name = "siteone-crawler"$/name = "siteone-crawler-static"/' Cargo.toml fi cargo generate-rpm --target ${{ matrix.target }} echo "RPM_PATH=$(find target -name '*.rpm' -path '*/generate-rpm/*' | head -1)" >> "$GITHUB_ENV" - name: Upload .deb artifact if: runner.os == 'Linux' uses: actions/upload-artifact@v7 with: name: siteone-crawler-${{ matrix.artifact_suffix }}-deb path: ${{ env.DEB_PATH }} - name: Upload .rpm artifact if: runner.os == 'Linux' uses: actions/upload-artifact@v7 with: name: siteone-crawler-${{ matrix.artifact_suffix }}-rpm path: ${{ env.RPM_PATH }} # ──────────────────────────────────────────────────────────────── - name: Upload artifact uses: actions/upload-artifact@v7 with: name: siteone-crawler-${{ matrix.artifact_suffix }} 
path: ${{ env.ARTIFACT_PATH }} # ───────────────────────────────────────────────────────────────── # Build Alpine .apk packages from musl binaries # ───────────────────────────────────────────────────────────────── package-alpine: name: Build Alpine .apk (${{ matrix.arch }}) needs: build runs-on: ubuntu-latest strategy: fail-fast: false matrix: include: - arch: x86_64 artifact_suffix: linux-musl-x64 - arch: aarch64 artifact_suffix: linux-musl-arm64 steps: - name: Checkout uses: actions/checkout@v6 - name: Determine version id: version shell: bash run: | if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then VERSION="${{ github.event.inputs.version }}" else VERSION="${GITHUB_REF_NAME#v}" fi echo "version=${VERSION}" >> "$GITHUB_OUTPUT" - name: Download musl binary uses: actions/download-artifact@v8 with: name: siteone-crawler-${{ matrix.artifact_suffix }} path: dist - name: Extract binary run: | VERSION="${{ steps.version.outputs.version }}" tar xzf "dist/siteone-crawler-v${VERSION}-${{ matrix.artifact_suffix }}.tar.gz" -C dist - name: Setup Alpine uses: jirutka/setup-alpine@v1 with: arch: ${{ matrix.arch }} packages: abuild - name: Prepare signing key shell: alpine.sh --root {0} env: ALPINE_RSA_KEY: ${{ secrets.ALPINE_RSA_PRIVATE_KEY }} ALPINE_RSA_PUB: ${{ secrets.ALPINE_RSA_PUBLIC_KEY }} run: | BUILDER=runner # Install signing key mkdir -p /etc/apk/keys printf '%s\n' "$ALPINE_RSA_PUB" > /etc/apk/keys/siteone.rsa.pub # Setup abuild config for builder mkdir -p "/home/$BUILDER/.abuild" printf '%s\n' "$ALPINE_RSA_KEY" > "/home/$BUILDER/.abuild/siteone.rsa" printf '%s\n' "$ALPINE_RSA_PUB" > "/home/$BUILDER/.abuild/siteone.rsa.pub" chmod 600 "/home/$BUILDER/.abuild/siteone.rsa" cat > "/home/$BUILDER/.abuild/abuild.conf" << 'EOF' PACKAGER_PRIVKEY="$HOME/.abuild/siteone.rsa" EOF chown -R "$BUILDER" "/home/$BUILDER/.abuild" # Add user to abuild group addgroup "$BUILDER" abuild - name: Build .apk shell: alpine.sh {0} env: VERSION: ${{ steps.version.outputs.version 
}} run: | ARCH=$(uname -m) # Prepare build directory mkdir -p ~/build cp "$GITHUB_WORKSPACE/dist/siteone-crawler/siteone-crawler" ~/build/ cp "$GITHUB_WORKSPACE/LICENSE" ~/build/ 2>/dev/null || true # Create APKBUILD cat > ~/build/APKBUILD << EOF # Maintainer: Jan Reges pkgname=siteone-crawler pkgver=${VERSION} pkgrel=1 pkgdesc="Website crawler and QA toolkit in Rust" url="https://crawler.siteone.io/" arch="${ARCH}" license="MIT" source="" options="!check !strip" package() { install -Dm755 "\$startdir/siteone-crawler" "\$pkgdir/usr/bin/siteone-crawler" install -Dm644 "\$startdir/LICENSE" "\$pkgdir/usr/share/licenses/\$pkgname/LICENSE" 2>/dev/null || true } EOF # Build the package cd ~/build abuild -d -P ~/packages # Copy and rename to include arch (both arches produce the same filename) mkdir -p "$GITHUB_WORKSPACE/apk-out" for f in $(find ~/packages -name '*.apk'); do BASENAME=$(basename "$f" .apk) cp "$f" "$GITHUB_WORKSPACE/apk-out/${BASENAME}-${ARCH}.apk" done - name: Upload .apk artifact uses: actions/upload-artifact@v7 with: name: siteone-crawler-alpine-${{ matrix.arch }} path: apk-out/*.apk release: name: Create GitHub Release needs: [build, package-alpine] runs-on: ubuntu-latest if: always() && startsWith(github.ref, 'refs/tags/v') && needs.build.result == 'success' steps: - name: Checkout uses: actions/checkout@v6 - name: Download all artifacts uses: actions/download-artifact@v8 with: path: artifacts merge-multiple: true - name: Determine version id: version run: echo "version=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT" - name: List artifacts run: ls -lhR artifacts/ - name: Create Release uses: softprops/action-gh-release@v2 with: name: "v${{ steps.version.outputs.version }}" body: | ### Downloads | Platform | Architecture | File | |----------|-------------|------| | Linux | x64 | `siteone-crawler-v${{ steps.version.outputs.version }}-linux-x64.tar.gz` | | Linux | arm64 | `siteone-crawler-v${{ steps.version.outputs.version }}-linux-arm64.tar.gz` | | Linux | 
x64 (musl/static) | `siteone-crawler-v${{ steps.version.outputs.version }}-linux-musl-x64.tar.gz` | | Linux | arm64 (musl/static) | `siteone-crawler-v${{ steps.version.outputs.version }}-linux-musl-arm64.tar.gz` | | macOS | arm64 (Apple Silicon) | `siteone-crawler-v${{ steps.version.outputs.version }}-macos-arm64.tar.gz` | | macOS | x64 (Intel) | `siteone-crawler-v${{ steps.version.outputs.version }}-macos-x64.tar.gz` | | Windows | x64 | `siteone-crawler-v${{ steps.version.outputs.version }}-win-x64.zip` | | Windows | arm64 | `siteone-crawler-v${{ steps.version.outputs.version }}-win-arm64.zip` | ### Linux packages (glibc — best performance, requires glibc 2.39+) | Format | Architecture | File | |--------|-------------|------| | Debian/Ubuntu (.deb) | x64 | `siteone-crawler_${{ steps.version.outputs.version }}-1_amd64.deb` | | Debian/Ubuntu (.deb) | arm64 | `siteone-crawler_${{ steps.version.outputs.version }}-1_arm64.deb` | | Fedora/RHEL (.rpm) | x64 | `siteone-crawler-${{ steps.version.outputs.version }}-1.x86_64.rpm` | | Fedora/RHEL (.rpm) | arm64 | `siteone-crawler-${{ steps.version.outputs.version }}-1.aarch64.rpm` | ### Linux packages (musl/static — any Linux, ~50–80% slower) | Format | Architecture | File | |--------|-------------|------| | Debian/Ubuntu (.deb) | x64 | `siteone-crawler-static_${{ steps.version.outputs.version }}-1_amd64.deb` | | Debian/Ubuntu (.deb) | arm64 | `siteone-crawler-static_${{ steps.version.outputs.version }}-1_arm64.deb` | | Fedora/RHEL (.rpm) | x64 | `siteone-crawler-static-${{ steps.version.outputs.version }}-1.x86_64.rpm` | | Fedora/RHEL (.rpm) | arm64 | `siteone-crawler-static-${{ steps.version.outputs.version }}-1.aarch64.rpm` | | Alpine (.apk) | x64 | `siteone-crawler-${{ steps.version.outputs.version }}-r1-x86_64.apk` | | Alpine (.apk) | arm64 | `siteone-crawler-${{ steps.version.outputs.version }}-r1-aarch64.apk` | ### Quick start ```bash # Extract and run tar xzf siteone-crawler-v${{ steps.version.outputs.version 
}}-linux-x64.tar.gz cd siteone-crawler ./siteone-crawler --url=https://example.com ``` ### Install via package manager ```bash # Debian/Ubuntu (glibc — Ubuntu 24.04+, Debian 13+) sudo dpkg -i siteone-crawler_${{ steps.version.outputs.version }}-1_amd64.deb # Debian/Ubuntu (static/musl — older distributions) sudo dpkg -i siteone-crawler-static_${{ steps.version.outputs.version }}-1_amd64.deb # Fedora/RHEL sudo dnf install ./siteone-crawler-${{ steps.version.outputs.version }}-1.x86_64.rpm ``` files: artifacts/* generate_release_notes: true draft: true prerelease: false ================================================ FILE: .gitignore ================================================ /target /tmp/ /dist/ *.swp *.swo *~ .idea/ .vscode/ *.cache ================================================ FILE: CHANGELOG.md ================================================ ### Changelog All notable changes to this project will be documented in this file. Dates are displayed in UTC. #### [v1.0.9](https://github.com/janreges/siteone-crawler/compare/v1.0.8...v1.0.9) - typos: non exhaustive typo and spelling corrections [`#8`](https://github.com/janreges/siteone-crawler/pull/8) - offline exporter: new option --ignore-store-file-error for the OfflineWebsiteExporter [`#16`](https://github.com/janreges/siteone-crawler/pull/16) - url handling: added option --transform-url to force requests for some URL to be internally transformed and a different URL/domain (e.g. 
local) to be queried, fixes #58 [`#58`](https://github.com/janreges/siteone-crawler/issues/58) - html report: added option to list which sections to include in the HTML report via --html-report-options (see README.md), fixes #63 [`#63`](https://github.com/janreges/siteone-crawler/issues/63) - offline export: fix behavior regarding URLs containing various valid UTF-8 characters (German, Chinese, etc.), fixes #65 [`#65`](https://github.com/janreges/siteone-crawler/issues/65) - seo analysis: fix for an issue that occurs when encoding UTF-8 due to some special characters in the content, fixes #51 [`#51`](https://github.com/janreges/siteone-crawler/issues/51) - offline website exporter: added option --offline-export-no-auto-redirect-html, which disables the generation of automatic sub-folder.html with meta redirects to sub-folder/index.html, fixes #54 [`#54`](https://github.com/janreges/siteone-crawler/issues/54) - offline website exporter: fix replacing reference where it is followed by and not an immediate number, fixes #52 [`#52`](https://github.com/janreges/siteone-crawler/issues/52) - slowest analyzer: fixed typo slowest->slower, fixes #42 [`#42`](https://github.com/janreges/siteone-crawler/issues/42) - url & sitemaps: as --url it is now possible to specify a URL to sitemap xml, or sitemap index xml, from which to find a list of all URLs, fixes #25 [`#25`](https://github.com/janreges/siteone-crawler/issues/25) - github: remove all unnecessary files from the release package [`e54029c`](https://github.com/janreges/siteone-crawler/commit/e54029cbef015a259d92e93933e81af2e851a145) - github: fix release workflow [`c9d5361`](https://github.com/janreges/siteone-crawler/commit/c9d5361acd646b24e47cb6e60e7d07be12cd96c9) - github: workflow for automatic creation of release archives for all 5 supported platforms/architectures [`0a461ac`](https://github.com/janreges/siteone-crawler/commit/0a461aca0b145982005b6a460d3e852a0767426a) - webp analysis: if there are avif images on the 
website (they are more optimized than webp), we will not report the absence of webp [`e067653`](https://github.com/janreges/siteone-crawler/commit/e06765332fa743f9bb22f5eb589cb71a01dc90db) - term: if TERM is not set or we're not in a TTY, use default width 138 [`eb839e4`](https://github.com/janreges/siteone-crawler/commit/eb839e423abf4df7020822986dc9e2ae43d44971) - options: handling of the situation of calling only 'crawler' without a parameters - complete documentation and a red message about the need to pass at least the --url parameter will be displayed [`fc390ae`](https://github.com/janreges/siteone-crawler/commit/fc390ae693ba201b060effc67a90ad893772558f) - phpstan: fix errors found by phpstan and increasing the memory limit for phpstan [`650d46a`](https://github.com/janreges/siteone-crawler/commit/650d46abb867ff04df24be01c3c6daebd42b0911) - tests: fix the tests after removing the underscore for the external domain [`b31a872`](https://github.com/janreges/siteone-crawler/commit/b31a872fdc439a321c364665d18634906ce8ad30) - Revert "url parser: fix url parsing in some cases when href starts with './'" [`240430b`](https://github.com/janreges/siteone-crawler/commit/240430bc90039063b9e810980360f798afa46f74) - url parser: fix url parsing in some cases when href starts with './' [`2443532`](https://github.com/janreges/siteone-crawler/commit/244353202c80c152b6a3b63ef83f6046338404e9) - url parser: fix url parsing in some cases when href starts with './' [`fe33e7b`](https://github.com/janreges/siteone-crawler/commit/fe33e7b6c404a62cf636db20b6193196c4bf6e25) - website to markdown: added --markdown-remove-links-and-images-from-single-file - useful when used within an AI tool to obtain context from a website (typically with documentation of a solution/framework) [`631e544`](https://github.com/janreges/siteone-crawler/commit/631e544b9eb836a01f68f055e80b8b35b16687dc) - website to markdown: fixed the problem with incorrect sorting of the root index.md (homepage should be at the 
beginning) [`c2ffff3`](https://github.com/janreges/siteone-crawler/commit/c2ffff32a48e84c872e132417ef1623015755e7e) - website to markdown: fine tuning of the resulting markdown files, correct detection of table headers, removal of excess whitespaces [`ee40b29`](https://github.com/janreges/siteone-crawler/commit/ee40b2915611824c676c5d4761a266edba6be0d2) - website to markdown: added --markdown-export-single-file for the ability to save all website content into one combined markdown file (smart detection and removal of shared headers and footers is also implemented) [`af01376`](https://github.com/janreges/siteone-crawler/commit/af013766830991f473d26dc25dc5804cc88b7c76) - readme: changed partnership to powered by JetBrains [`e77f755`](https://github.com/janreges/siteone-crawler/commit/e77f755527319e99d66cec6b2b2864dee4d560e4) - readme: added partnership with JetBrains [`0104646`](https://github.com/janreges/siteone-crawler/commit/0104646b6f209eae1c530bb68160a5fa238f7dda) - website to markdown: added implicit excluded selectors for typical 'hidden' classes [`b3c57d6`](https://github.com/janreges/siteone-crawler/commit/b3c57d69f9ef52592cab308b493b328b81c29705) - website to markdown: consecutive links fixes (ignore links without visible text or defined href) [`6d9a310`](https://github.com/janreges/siteone-crawler/commit/6d9a31053a56bb532961395a2e1821f2028e36ac) - website to markdown: list fixes and prepared auto-removal of duplicates (e.g. 
desktop & mobile version of menus) [`338b0c6`](https://github.com/janreges/siteone-crawler/commit/338b0c692a434a4f3a2a20160c9f45004526c04a) - website to markdown: removed unwanted escaping from links/images [`35c6f57`](https://github.com/janreges/siteone-crawler/commit/35c6f579a62080316210ec00734af8069ea32f27) - website to markdown: refactoring the way ul/ol lists are composed (there were problems with nested lists and whitespaces) [`15ea68c`](https://github.com/janreges/siteone-crawler/commit/15ea68ce7e3e3d962c5813ad971571eec42fe933) - README: improved introduction and added icons [`737b8c6`](https://github.com/janreges/siteone-crawler/commit/737b8c63bce618bdd613090205866d03bde1d67b) - docs: added Table of Contents to JSON-OUTPUT.md and TEXT-OUTPUT.md [`2aa2856`](https://github.com/janreges/siteone-crawler/commit/2aa28569de9fa0315219e68d084cc675ead57303) - docs: added detailed documentation and real sample JSON and TXT output from the crawler for a better idea of its functionality [`09495d1`](https://github.com/janreges/siteone-crawler/commit/09495d187e3f80d4f4c29176c12540e300c5cb6f) - docs: added detailed documentation and real sample JSON and TXT output from the crawler for a better idea of its functionality [`cb7606b`](https://github.com/janreges/siteone-crawler/commit/cb7606b2c8fe1547cfdf787d0b7050693228ff2e) - json output docs: first version [`73e8d45`](https://github.com/janreges/siteone-crawler/commit/73e8d45ad93e1cc88a778533d0955e22cec9d6c7) - output options: added option --timezone (e.g. 
Europe/Prague, default is UTC) to set the time zone in which dates and times in HTML reports and exported folder/file names should be, refs #57 [`e3d3213`](https://github.com/janreges/siteone-crawler/commit/e3d321315b6c9f0290b5795345a90e78af32a358) - website to markdown: use link URL as text when link text is empty [`873ffae`](https://github.com/janreges/siteone-crawler/commit/873ffae76a8d96c8a2a4e4670ad09f4ed8527d4a) - website to markdown: if the link contains nested div/span tags, display the link in markdown as a list-item so that it is on its own line [`c48f346`](https://github.com/janreges/siteone-crawler/commit/c48f34614a057b79e5f9e5d5fbb9877cd7c2d25f) - website to markdown: removed the use of html2markdown (problematic integration on windows due to cygwin) and replaced with a custom HtmlToMarkdownConverter [`4e1db09`](https://github.com/janreges/siteone-crawler/commit/4e1db090f7b9276663c8fda587e8673d67783340) - content processor: added justification for skipping URLs due to exceeding --max-depth [`a6bc08a`](https://github.com/janreges/siteone-crawler/commit/a6bc08ac2367b3fb008b51e1278b3b78ae5bfe28) - README: converting arguments to a table view and adding missing links to the outline [`c23a686`](https://github.com/janreges/siteone-crawler/commit/c23a6860f06918062a159747039bd38e868cd7f8) - README: added all missing options (--max-reqs-per-sec, --max-heading-level, --websocket-server, --console-width and a few others less important) [`82c48bc`](https://github.com/janreges/siteone-crawler/commit/82c48bccf597a7c3811c16ef6c8b29fc37d7c46c) - extra columns: added option to extract data using XPath and RegEx to --extra-columns [`cd6d55a`](https://github.com/janreges/siteone-crawler/commit/cd6d55af254f4f38b25399293aa6d122c578f4c7) - http response: ensuring that the repeated response header is merged into a concatenated string, instead of an array, refs #48 [`c0f3b21`](https://github.com/janreges/siteone-crawler/commit/c0f3b210e3ddca203eb9363f038bcf4e30a3f30c) - css 
processor: fix for a situation where some processors could cause CSS content to be NULL [`c8f2ffc`](https://github.com/janreges/siteone-crawler/commit/c8f2ffc45628a2f0f1e477dc7e2ea436c9ebafbe) - website to markdown: better removal of nested images in situations like [![logo by @foobar](data:image/gif;base64,fooo= "logo by @foobar")](index.html) [`9ecba5e`](https://github.com/janreges/siteone-crawler/commit/9ecba5e91608fbbcd625e3ff42621869a7e31f00) - website to markdown: first version of the converter of entire web pages to markdown [`b944edb`](https://github.com/janreges/siteone-crawler/commit/b944edbcc33381c97ba220e1920994574c676225) - security check: handle case of multiple headers with the same name [`706977e`](https://github.com/janreges/siteone-crawler/commit/706977e545c428ab82714e95a75294841dac5e46) - html processor: do not remove the schema and host for URLs defined in --ignore-regex [`8be42af`](https://github.com/janreges/siteone-crawler/commit/8be42afea5af076aee097842fa3c4996e66c47ef) - offline export: added --offline-export-remove-unwanted-code=<1/0> (default is 1) to remove unwanted code for offline mode - typically, JS of the analytics, social networks, cookie consent, cross origins, etc .. refs #37 [`17a11fa`](https://github.com/janreges/siteone-crawler/commit/17a11fa3fe7a2d9c012e0f70c2392e833e02193c) - loop protection: added --max-non200-responses-per-basename as configurable protection against looping with dynamic non-200 URLs. 
If a basename (the last part of the URL after the last slash) has more non-200 responses than this limit, other URLs with same basename will be ignored/skipped [`063bddf`](https://github.com/janreges/siteone-crawler/commit/063bddf47a9fe82dc2b08297acd16fd154001feb) - bin/swoole-cli: upgrade to latest Swoole 6.0.0 (this version already supports Swoole\Threads - in the future there will be a refactoring that will relieve us of the necessity to use Swoole\Table, which requires memory preallocation for a predefined number of rows + my ticket https://github.com/swoole/swoole-src/issues/5460 has been processed regarding the support of getting the values of repeated header) [`b6e7c23`](https://github.com/janreges/siteone-crawler/commit/b6e7c23c055032a1003605ef2679f1ca59b64a08) - css processor: fix query string and anchor processing for paths in url() + don't replace url(data:*) with complex information e.g. about svg including brackets, refs #31 [`36eece8`](https://github.com/janreges/siteone-crawler/commit/36eece89c0719602145dbf51673d80355a80bfd2) - skipped urls: width defined fixed at 60 - better for most situations than the previous dynamic calculation [`8ef462f`](https://github.com/janreges/siteone-crawler/commit/8ef462f2fb52b65213136705536ba575dd2a9511) - manager: refactored mb_convert_encoding() -> htmlentities() as part of the migration to PHP 8.4.1 [`5c7c903`](https://github.com/janreges/siteone-crawler/commit/5c7c903d7d4d178b691c870e4a71fc862685c21d) - http cache analysis: added analysis of http cache of all pages and assets - divided by content type, domains, and their combination [`b09cfbd`](https://github.com/janreges/siteone-crawler/commit/b09cfbdf3fe033feef3b64b0fcbbda15dc0308ab) - css processing: added search for urls in @import url(*.css) [`c964fea`](https://github.com/janreges/siteone-crawler/commit/c964fea1382fec71b990ac2cd89683590694d5b3) - analysis/report: if there is no URL with code >= 200, there is no point to perform analysis, print empty output of 
all analyzers and generate full report [`c1bb448`](https://github.com/janreges/siteone-crawler/commit/c1bb448922cb47d0fe7fa28d2c5f540d6961ea94) - options: fix passing booleans to correctUrl() in case of empty '-u' or '--url' parameters (recognized as boolean flags) [`a297fec`](https://github.com/janreges/siteone-crawler/commit/a297feccb34002604705c180855d8f12cd0e41a2) - skipped-urls: added overview of skipped URLs including summary across domains - not only from security point of view it is good to know where external links are pointing and from where js/css/fonts/images are loaded [`84ae146`](https://github.com/janreges/siteone-crawler/commit/84ae1467a6c02b194e9e5631351f00a52b5924e0) - user-agent: if a manually defined user-agent ends with the exclamation !, do not add the signature siteone-crawler/version and remove the exclamation [`cfda3b0`](https://github.com/janreges/siteone-crawler/commit/cfda3b072e208966f9e7078211257d1a027d2bfa) - options: better response and warning for unfilled required --url [`52e50db`](https://github.com/janreges/siteone-crawler/commit/52e50db58f15bd10ab64b70f4c3f3fbf299c0135) - dns resolving: added --resolve attribute, which behaves exactly the same as curl, and using the 'domain:port:ip' entry it is possible to provide a custom IP address for the domain:port pair [`4031181`](https://github.com/janreges/siteone-crawler/commit/403118132807f30c65ed89b1b2d8f924a22e3a90) - windows/cygwin: workarounds for cygwin environment to return as much DNS/SSL/TLS info as possible even if nslookup or dig cannot be called [`bfc4f55`](https://github.com/janreges/siteone-crawler/commit/bfc4f5508e85b4af2e7c17309181791a3a9d5fc1) - upload timeout: fix that --upload-timeout does not overwrite the primary timeout [`c429639`](https://github.com/janreges/siteone-crawler/commit/c429639e30d44420bc9af017536714df52868813) - readme: adding a sample report and clone of nextjs.org and a few other updates 
[`07ad5e1`](https://github.com/janreges/siteone-crawler/commit/07ad5e119b47455ff4a2e3ba6230a21203d40396) - readme: added description for --allowed-domain-for-external-files and --allowed-domain-for-crawling [`0c8b1b3`](https://github.com/janreges/siteone-crawler/commit/0c8b1b3fb791a5c0e8f540a34d62122682680c19) - filtering: added --single-foreign-page to ensure that only the linked page and its assets are loaded from the external domain (which second-level domain is not the same as the initialization URL), but not all other pages on the external domain are automatically crawled [`c4af4ec`](https://github.com/janreges/siteone-crawler/commit/c4af4ec5fb76456f4d47eaf6041ba4be4fbb48b8) - filtering: added --disable-all-assets as a shortcut for calling all --disable-* flags [`7e32c44`](https://github.com/janreges/siteone-crawler/commit/7e32c440fb0ee0260f7b1e2c6b2a01b753ffb149) - filtering: added --max-depth=<int> for maximum crawling depth (for pages, not assets) and --single-page moved to basic options [`2dbff75`](https://github.com/janreges/siteone-crawler/commit/2dbff756dec3735f8f8c9f293dcd846eb3b3fde6) - resource filtering: added --single-page for loading only one given URL and their assets [`7325a4b`](https://github.com/janreges/siteone-crawler/commit/7325a4bbf633f60015309e257af509a5f21384d5) - offline exporter: added the possibility to use --replace-query-string to replace the default behavior where the query string is replaced by a short hash constructed from the query string in filenames, see issue #30 [`1a3482c`](https://github.com/janreges/siteone-crawler/commit/1a3482c6dada06b8482f205ceb181d8b42a62607) - offline export: added --replace-content=<val> option to replace content in HTML/JS/CSS before saving to disk (with strict text & regexp support) [`81cddaa`](https://github.com/janreges/siteone-crawler/commit/81cddaaf57550ac253b3e1ab322c3f5498374e96) - revert caps 
[`76a7418`](https://github.com/janreges/siteone-crawler/commit/76a74184c871714871f537344a84e757069fff0c) - Revert "Auxiliary commit to revert individual files from b3bb0eea10075aee124cce485379c24ece78df79" [`5878be9`](https://github.com/janreges/siteone-crawler/commit/5878be97f663d8ac70eac9e56578e628faeabb9f) - robots.txt handling: process Disallow records only for user-agent 'SiteOne-Crawler' or '*' [`9c2c989`](https://github.com/janreges/siteone-crawler/commit/9c2c989c569fed518bb5139c1d496159cc486683) - new option for the OfflineWebsiteExporter [`2c4bbbc`](https://github.com/janreges/siteone-crawler/commit/2c4bbbc6f0e55a4f3af6a89be50450e15b65cdd2) - tables: added --rows-limit option (default 200) to hard limit the length of all tables with data from analyses (except Visited URLs) to prevent very long and slow reports .. tables are sorted by severity, so it should be ok [`9798252`](https://github.com/janreges/siteone-crawler/commit/9798252901dd25797d1d38fa26a19c6dbc409fa1) - video gallery: added display of all found videos with video player (including use of observer for lazy loading and smart option to preload first seconds of video + button to play 2 seconds of each video sequentially) [`411736a`](https://github.com/janreges/siteone-crawler/commit/411736ac3852d07464fe4a4a52c4c0bf171d716f) - license: change of licensing to MIT [`14b73e2`](https://github.com/janreges/siteone-crawler/commit/14b73e2e10cc924112966d2c5b16812dadf1fc48) - non exhaustive typo and spelling corrections [`b3bb0ee`](https://github.com/janreges/siteone-crawler/commit/b3bb0eea10075aee124cce485379c24ece78df79) #### [v1.0.8](https://github.com/janreges/siteone-crawler/compare/v1.0.7...v1.0.8) > 24 August 2024 - reports: changed file name composition from report.mydomain.com.* to mydomain.com.report.* [`#9`](https://github.com/janreges/siteone-crawler/pull/9) - version: update to 1.0.8.20240824 
[`6c634e0`](https://github.com/janreges/siteone-crawler/commit/6c634e0f88cce49aa3f5fb9cd69ca55fa5191bd8) - version 1.0.8.20240824 + changelog [`a02cc7b`](https://github.com/janreges/siteone-crawler/commit/a02cc7bf4c0fc4703189341d9ea0be2345b95796) - crawler: solved edge-case, which very rarely occurred when the queue processing was already finished, but the last outstanding coroutine still found some new URL [`a85990d`](https://github.com/janreges/siteone-crawler/commit/a85990d662d74af281805cfdf10c0320fee0007a) - javascript processor: improvement of webpack JS processing in order to correctly replace paths from VueJS during offline export (as e.g. in case of docs.netlify.com) .. without this, HTML had the correct paths in the left menu, but JS immediately broke them because they started with an absolute path with a slash at the beginning [`9bea99b`](https://github.com/janreges/siteone-crawler/commit/9bea99b9684e6059b8abfad4b382fafdad31c9a9) - offline export: detect and process fonts.googleapis.com/css* as CSS even if there is no .css extension [`da33100`](https://github.com/janreges/siteone-crawler/commit/da33100975635be8305e07c2023a22c300b66216) - js processor: removed the forgotten var_dump [`5f2c36d`](https://github.com/janreges/siteone-crawler/commit/5f2c36de1666e6987d2c9d88a39e3b6d0a2e1f32) - offline export: improved search for external JS in the case of webpack (dynamic composition of URLs from an object with the definition of chunks) - it was debugged on docs.netlify.com [`a61e72e`](https://github.com/janreges/siteone-crawler/commit/a61e72e7f5b773a437b4151432db04a5afd7124a) - offline export: in case the URL ends with a dot and a number (so it looks like an extension), we must not recognize it as an extension in some cases [`c382d95`](https://github.com/janreges/siteone-crawler/commit/c382d959f7440ebfcd95566ec0050e771a2f3495) - offline url converter: better support for SVG in case the URL does not contain an extension at all, but has e.g. 
'icon' in the URL (it's not perfect) [`c9c01a6`](https://github.com/janreges/siteone-crawler/commit/c9c01a69905fefce82f4e8f85e707a0d1abb5e1e) - offline exporter: warning instead of exception for some edge-cases, e.g. not saving SVG without an extension does not cause the export to stop [`9d285f4`](https://github.com/janreges/siteone-crawler/commit/9d285f4d599ba8892dd8752e8d831cd3c86af178) - cors: do not set Origin request header for images (otherwise error 403 on cdn.sanity.io for svg, etc.) [`2f3b7eb`](https://github.com/janreges/siteone-crawler/commit/2f3b7eb51a03d42d3d2961c84aadcd118b546e05) - best practice analyzer: in checking for missing quotes ignore values longer than 1000 characters (fixes, e.g., at skoda-auto.cz the error Compilation failed: regular expression is too large at offset 90936) [`8a009df`](https://github.com/janreges/siteone-crawler/commit/8a009df9734773275fd9805862dc9bfeeccb6079) - html report: added loading of extra headers to the visited URL list in the HTML report [`781cf17`](https://github.com/janreges/siteone-crawler/commit/781cf17c18088126db74ebc1ef00fee3d6784979) - Frontload the report names [`62d2aae`](https://github.com/janreges/siteone-crawler/commit/62d2aae57e31c7bfa53720446cc8dfbc59e482af) - robots.txt: added option --ignore-robots-txt (we often need to view internal or preview domains that are otherwise prohibited from indexing by search engines) [`9017c45`](https://github.com/janreges/siteone-crawler/commit/9017c45a675dd327895b57f14095ad6bd52a02fc) - http client: added an explicit 'Connection: close' header and explicitly calling $client->close(), even though Swoole was doing it automatically after exiting the coroutine [`86a7346`](https://github.com/janreges/siteone-crawler/commit/86a7346d059452d210b945ca4329e1cc17781dca) - javascript processor: parse url addresses to import the JS module only in JS files (otherwise imports from HTML documentation, e.g. 
on the websites svelte.dev or nextjs.org, were parsed by mistake) [`592b618`](https://github.com/janreges/siteone-crawler/commit/592b618c01e75509e16a812fafab7f21f3c7c64d) - html processor: added obtaining urls from HTML attributes that are not wrapped in quotes (but I am aware that current regexps can cause problems in the cases when are used spaces, which are not properly escaped) [`f00abab`](https://github.com/janreges/siteone-crawler/commit/f00ababfa459eca27dce7657fe91c70831f86089) - offline url converter: swapping woff2/woff order for regex because in this case their priority is important and because of that woff2 didn't work properly [`3f318d1`](https://github.com/janreges/siteone-crawler/commit/3f318d19fa0a3757546493ac7f47cca21922b1f5) - non-200 url basename detection: we no longer consider e.g. image generators that have the same basename and the url to the image in the query parameters as the same basename [`bc15ef1`](https://github.com/janreges/siteone-crawler/commit/bc15ef198bb13fe845fef8cd4946b2cab5c2ea6d) - supertable: activation of automatic creation of active links also for homepage '/' [`c2e228e`](https://github.com/janreges/siteone-crawler/commit/c2e228e0d475351431cf9b060487e86ce6d33e52) - analysis and robots.txt: improving the display of url addresses for SEO analysis in the case of a multi-domain website, so that it cannot happen that the same url, e.g. 
'/', is in the overview multiple times without recognizing the domain or scheme + improving the work with robots.txt in SEO detection and displaying urls banned for indexing [`47c7602`](https://github.com/janreges/siteone-crawler/commit/47c7602217e40a4f6d4f3af5c71d6dff72952aab) - offline website exporter: we add the suffix '_' to the folder name only in the case of a typical extension of a static file - we don't want this to happen with domain names as well [`d16722a`](https://github.com/janreges/siteone-crawler/commit/d16722a5ad6271270fb0fff11e66a7f02f3b6e9a) - javascript processor: extract JS urls also from imports like import {xy} from "./path/foo.js" [`aec6cab`](https://github.com/janreges/siteone-crawler/commit/aec6cab051a46df9d89866f5cfd7e66312dafb92) - visited url: added 'txt' extension to looksLikeStaticFileByUrl() [`460c645`](https://github.com/janreges/siteone-crawler/commit/460c6453d91e85c2889ebaa2b2542fd88c5ffa6a) - html processor: extract JS urls also from <link href="*.js">, typically with rel="modulepreload" [`c4a92be`](https://github.com/janreges/siteone-crawler/commit/c4a92bee00d96c530431134370a3ba0d2216a1c1) - html processor: extracting repeated calls to getFullUrl() into a variable [`a5e1306`](https://github.com/janreges/siteone-crawler/commit/a5e1306530717d9edd4f95a7989539a172a38f4a) - analysis: do not include urls that failed to load (timeout, skipping, etc.) 
in the analysis of content-types and source-domains - prevention of displaying content type 'unknown' [`b21ecfb`](https://github.com/janreges/siteone-crawler/commit/b21ecfb85f58d07c0a82b93826ad2977ab2cd523) - cli options: improved method of removing quotes even for options that can be arrays - also fixes --extra-columns='Title' [`97f2761`](https://github.com/janreges/siteone-crawler/commit/97f27611acf2fc4ed24b1e5574be84711ea3fa12) - url skipping: if there are a lot of URLs with the same basename (ending after the last slash), we will allow a maximum of 5 requests for URLs with the same basename - the purpose is to prevent a lot of 404 from being triggered when there is an incorrect relative link to relative/my-img.jpg on all pages (e.g. on 404 page on v2.svelte.dev) [`4fbb917`](https://github.com/janreges/siteone-crawler/commit/4fbb91791f9111cc6f9d98b60732fcca7fad2f1f) - analysis: perform most of the analysis only on URLs from domains for which we have crawling enabled [`313adde`](https://github.com/janreges/siteone-crawler/commit/313addede29ac847273b6ab6ed3a8ab878a6fb4a) - audio & video: added audio/video file search in <audio> and <video> tags, if file crawling is not disabled [`d72a5a5`](https://github.com/janreges/siteone-crawler/commit/d72a5a51bd6863425a3d8bcffc7a9b5eb831f979) - base practices: retexting stupid warning like '<h2> after <h0>' to '<h2> without previous heading' [`041b383`](https://github.com/janreges/siteone-crawler/commit/041b3836a8a585158ae1a1a6fb0057b367f3a4f6) - initial url redirect: in the case that a URL is entered that redirects to another url/domain within the same 2nd-level domain (typically http->https or mydomain.tld -> www.mydomain.tld redirects), we continue crawling with new url/domain and declare a new url as initial url [`166e617`](https://github.com/janreges/siteone-crawler/commit/166e617fbc893798dc7b340f43de75df2d4cf335) #### [v1.0.7](https://github.com/janreges/siteone-crawler/compare/v1.0.6...v1.0.7) > 22 December 2023 - 
version 1.0.7.20231222 + changelog [`9d2be52`](https://github.com/janreges/siteone-crawler/commit/9d2be52776c081989322953c7a31debfd4947420) - html report template: updated logo link to crawler.siteone.io [`9892cfe`](https://github.com/janreges/siteone-crawler/commit/9892cfe5708a3da2f5fc355246dd50b2a0c5cb4f) - http headers analysis: renamed 'Headers' to 'HTTP headers' [`436e6ea`](https://github.com/janreges/siteone-crawler/commit/436e6ea5a9914c8615bb03b444ac0aad15e31c49) - sitemap generator: added info about crawler to generated sitemap.xml [`7cb7005`](https://github.com/janreges/siteone-crawler/commit/7cb7005bf50b8f93b421c94c57ff51eb99b45912) - html report: refactor of all inline on* event listeners to data attributes and event listeners added from static JS inside <script>, so that we can disable all inline JS in the online HTML report and allow only our JS signed with hashes by Content-Security-Policy [`b576eef`](https://github.com/janreges/siteone-crawler/commit/b576eef55a5678a67928970fc51aaaefd7abd1a8) - readme: removed HTTP auth from roadmap (it's already done), improved guide how to implement own upload endpoint and message about SMTP moved under mailer options [`e1567ae`](https://github.com/janreges/siteone-crawler/commit/e1567aee52f9d09c1cef1ad35babaf9eea388175) - utils: hide passwords/authentication specified in cli parameters as *auth=xyz (e.g. 
--http-auth=abc:xyz)" in html report [`c8bb88f`](https://github.com/janreges/siteone-crawler/commit/c8bb88fc1a65ecdfd53db23fc5d972b841830837) - readme: fixed formatting of the upload and expert options [`2d14bd5`](https://github.com/janreges/siteone-crawler/commit/2d14bd5972496989624f91617de2689601e1c027) - readme: added Upload Options [`d8352c5`](https://github.com/janreges/siteone-crawler/commit/d8352c5acfddbeef1c1ae6498556dc296d944e0b) - upload exporter: added possibility via --upload to upload HTML report to offline URL, by default crawler.siteone.io/html/* [`2a027c3`](https://github.com/janreges/siteone-crawler/commit/2a027c38bfdb8e6e416b9a79ebe81e809c9326d9) - parsed-url: fixed warning in the case of url without host [`284e844`](https://github.com/janreges/siteone-crawler/commit/284e844f3f94cdb02032ddb76e51caa9a584c120) - seo and opengraph: fixed false positives 'DENY (robots.txt)' in some cases [`658b649`](https://github.com/janreges/siteone-crawler/commit/658b6494130fa282505ec38f12aa058acf7709b9) - best practices and inline-svgs: detection and display of the entire icon set in the HTML report in the case of <svg> with more <symbol> or <g> [`3b2772c`](https://github.com/janreges/siteone-crawler/commit/3b2772c59f822b7b4a6f91e15b616815b5ff92c4) - sitemap generator: sort urls primary by number of dashes and secondary alphabetically (thanks to this, urls of the main levels will be at the beginning) [`bbc47e6`](https://github.com/janreges/siteone-crawler/commit/bbc47e6239f9693c621016a50e624698dc3d242d) - sitemap generator: only include URLs from the same domain as the initial URL [`9969254`](https://github.com/janreges/siteone-crawler/commit/9969254e35cd8c134f85a7817de8722091f0377c) - changelog: updated by 'composer changelog' [`0c67fd4`](https://github.com/janreges/siteone-crawler/commit/0c67fd4f8d308d8d51d5b912d9b82cc96fb6e4fb) - package.json: used by auto-changelog generator 
[`6ad8789`](https://github.com/janreges/siteone-crawler/commit/6ad87895e5a8ab8bbce3d9cbf92ee5e8b8218cc0) #### [v1.0.6](https://github.com/janreges/siteone-crawler/compare/v1.0.5...v1.0.6) > 8 December 2023 - readme: removed bold links from the intro (it didn't look as good on github as it did in the IDE) [`b675873`](https://github.com/janreges/siteone-crawler/commit/b6758733cde67f11322a2f82573b19ec1a0edc9d) - readme: improved intro and gif animation with the real output [`fd9e2d6`](https://github.com/janreges/siteone-crawler/commit/fd9e2d69c8f940cfaa81ad7bab86f1a74f01b0da) - http auth: for security reasons, we only send auth data to the same 2nd level domain (and possibly subdomains). With HTTP basic auth, the name and password are only base64 encoded and we would send them to foreign domains (which are referred to from the crawled website) [`4bc8a7f`](https://github.com/janreges/siteone-crawler/commit/4bc8a7f9871064aa1c88c374aa299904409d2817) - html report: increased specificity of the .header class for the header, because this class were also used by the generic class at <td class='header'> in security tab [`9d270e8`](https://github.com/janreges/siteone-crawler/commit/9d270e884545d6459f20348db71404e513ae8928) - html report: improved readability of badge colors in light mode [`76c5680`](https://github.com/janreges/siteone-crawler/commit/76c5680397446b84f3b13800590d914b7a9b0533) - crawler: moving the decrement of active workers after parsing URLs from the content, where further filling of the queue could occur (for this reason, queue processing could sometimes get stuck in the final stages) [`f8f82ab`](https://github.com/janreges/siteone-crawler/commit/f8f82ab61c1969952bb70f1b598ed3d97938a84e) - analysis: do not parse/check empty HTML (it produced unnecessary warning) - it is valid to have content-type: text/html but with connect-lengt: 0 (for example case for 'gtm.js?id=') 
[`436d81b`](https://github.com/janreges/siteone-crawler/commit/436d81b81f905178fb972f8b5cd0236bac244bc4) #### [v1.0.5](https://github.com/janreges/siteone-crawler/compare/v1.0.4...v1.0.5) > 3 December 2023 - changelog: updated changelog after 3 added commits to still untagged draft release 1.0.5 [`f42fe18`](https://github.com/janreges/siteone-crawler/commit/f42fe18de89676dc0dea4dc033207c934282d04b) - utils tests: fixed tests of methods getAbsolutePath() and getOutputFormattedPath() [`d4f4576`](https://github.com/janreges/siteone-crawler/commit/d4f4576ff566eb48495c9fb55a898b0989ef42c3) - crawler.php: replaced preg_match to str_contains [`5b28952`](https://github.com/janreges/siteone-crawler/commit/5b289521cdbb90b6571a29cb9c880e065b852129) - version: 1.0.5.20231204 + changelog [`7f2e974`](https://github.com/janreges/siteone-crawler/commit/7f2e9741fab25e9369151bc2d79a38b8827e2463) - option: replace placeholders like a '%domain' also in validateValue() method because there is also check if path is writable with attempt to mkdir [`329143f`](https://github.com/janreges/siteone-crawler/commit/329143fa23925ea523504735b3f724c026fe5ac6) - swoole in cygwin: improved getBaseDir() to work better even with the version of Swoole that does not have SCRIPT_DIR [`94cc5af`](https://github.com/janreges/siteone-crawler/commit/94cc5af4411a8c7427ee136a937ac629b8637668) - html processor: it must also process the page with the redirect, because is needed to replace the URL in the meta redirect tag [`9ce0eee`](https://github.com/janreges/siteone-crawler/commit/9ce0eeeebe1e524b9d46d91dd4cecb2e796db8c3) - sitemap: use formatted output path (primary for better output in Cygwin environment with needed C:/foo <-> /cygwin/c/foo conversion) [`6297a7f`](https://github.com/janreges/siteone-crawler/commit/6297a7f4069f9e09c013268e0df896db2fa91dec) - file exporter: use formatted output path (primary for better output in Cygwin environment with needed C:/foo <-> /cygwin/c/foo conversion) 
[`426cfb2`](https://github.com/janreges/siteone-crawler/commit/426cfb2b32f854d65abfce841e4e4f4badf04fef) - options: in the case of dir/file validation, we want to work with absolute paths for more precise error messages [`6df228b`](https://github.com/janreges/siteone-crawler/commit/6df228bdfc87a2c9fb6eee611fdc87d976b7f721) - crawler.php: improved baseDir detection - we want to work with absolute path in all scenarios [`9d1b2ce`](https://github.com/janreges/siteone-crawler/commit/9d1b2ce9bedb15ede90bcee9641e1cfc62b9c3cc) - utils: improved getAbsolutePath() for cygwin and added getOutputFormattedPath() with reverse logic for cygwin (C:/foo/bar <-> /cygdrive/c/foo/bar) [`161cfc5`](https://github.com/janreges/siteone-crawler/commit/161cfc5c4fd3fa3675cade409d7d5e11db2da0c6) - offline export: renamed --offline-export-directory to --offline-export-dir for consistency with --http-cache-dir or --result-storage-dir [`26ef45d`](https://github.com/janreges/siteone-crawler/commit/26ef45d145a1a02a5313067e6298571e26d9618b) #### [v1.0.4](https://github.com/janreges/siteone-crawler/compare/v1.0.3...v1.0.4) > 30 November 2023 - dom parsing: handling warnings in case of impossibility to parse some DOM elements correctly, fixes #3 [`#3`](https://github.com/janreges/siteone-crawler/issues/3) - version: 1.0.4.20231201 + changelog [`8e15781`](https://github.com/janreges/siteone-crawler/commit/8e15781265cdd9cce10d9dcde57d46b57b50e1cf) - options: ignore empty values in the case of directives with the possibility of repeated definition [`5e30c2f`](https://github.com/janreges/siteone-crawler/commit/5e30c2f8ad6cf00ad819ba1d7d6ec4e6c95a7113) - http-cache: now the http cache is turned off using the 'off' value (it's more understandable) [`9508409`](https://github.com/janreges/siteone-crawler/commit/9508409fbba2d96dc92cd73bed5abe462d5cea15) - core options: added --console-width to enforce the definition of the console width and disable automatic detection via 'tput cols' on macOS/Linux or 'mode 
con' on Windows (used by Electron GUI) [`8cf44b0`](https://github.com/janreges/siteone-crawler/commit/8cf44b06616e15301c486146a7c6b1003ce5137f) - gui support: added base-dir detection for Windows where the GUI crawler runs in Cygwin [`5ce893a`](https://github.com/janreges/siteone-crawler/commit/5ce893a66c7f1e21af025603b66223e04246e029) - renaming: renamed 'siteone-website-crawler' to 'siteone-crawler' and 'SiteOne Website Crawler' to 'SiteOne Crawler' [`64ddde4`](https://github.com/janreges/siteone-crawler/commit/64ddde4b53f16679a8c4671c98b3f9c619d94b42) - utils: fixed color-support detection [`62dbac0`](https://github.com/janreges/siteone-crawler/commit/62dbac07d15ecfa0ff677c277e2a3381a47025bf) - core options: added --force-color options to bypass tty detection (used by Electron GUI) [`607b4ad`](https://github.com/janreges/siteone-crawler/commit/607b4ad8583845adea209f75edfa27870ac23f9d) - best practice analysis: in the case of checking an image (e.g. for the existence of WebP/AVIF), we also want to check external images, because very often websites have images linked from external domains or services for image modification or optimization [`6100187`](https://github.com/janreges/siteone-crawler/commit/6100187347e0bbba6270335e2d9b2faf37475333) - html report: set scaleDown as default object-fit for image gallery [`91cd300`](https://github.com/janreges/siteone-crawler/commit/91cd300dcd7455c2b9be548fb2746cea7fd7c904) - offline exporter: added short -oed as alias to --offline-export-directory [`22368d9`](https://github.com/janreges/siteone-crawler/commit/22368d9a892aab8011aa4a0884bf01a8560f6167) - image gallery: list of all images on the website (except those from the srcset, where there would be duplicates only in other sizes or formats), including SVG with rich filtering options (through image format, size and source tag/attribute) and the option of choosing small/medium/view and scale-down/contains/cover for object-fit css property 
[`43de0af`](https://github.com/janreges/siteone-crawler/commit/43de0af1c60d398f91b373c192d1a35ac2df2fd1) - core options: added a shortened version of the command name consisting of only one hyphen and the first letters of the words of the full command (e.g. --memory-limit has short version -ml), added getInitialScheme() [`eb9a3cc`](https://github.com/janreges/siteone-crawler/commit/eb9a3cc62dffc58be2701c52bb21509d39a5dfad) - visited url: added 'sourceAttr' with information about where the given URL was found and useful helper methods [`6de4e39`](https://github.com/janreges/siteone-crawler/commit/6de4e39c5f8b9ba685e3865193274ccf0ee91a3d) - found urls: in the case of the occurrence of one URL in several places/attributes, we consider the first one to be the main one (typically the same URL in src and then also in srcset) [`660bb2b`](https://github.com/janreges/siteone-crawler/commit/660bb2b2bd2cb6949fe9c573e72b31e9fb97a9fe) - url parsing: added more recognition of which attributes the given URL address was parsed from (we need to recognize src and srcset for ImageGallery in particular) [`802c3c6`](https://github.com/janreges/siteone-crawler/commit/802c3c66a40087745e68f47392f0e6e8e9725171) - supertable and urls: in removing the redundant hostname for a more compact URL output, we also take into account the scheme http:// or https:// of initial URL (otherwise somewhere it looked like duplicate) + prevention of ansi-color definitions for bash in the HTML output [`915469e`](https://github.com/janreges/siteone-crawler/commit/915469e2a4a6d0fed337ca70efe9170758751ade) - title/description/keywords parsing: added html entities decoding because some website uses decoded entities with í – etc [`920523d`](https://github.com/janreges/siteone-crawler/commit/920523d3c55baf6cd7b2602334d9776b3e40f4d7) - crawler: added 'sourceAttr' to the swoole table queue and already visited URLs (we will use it in the Image Gallery for filtering, so as not to display unnecessarily and a lot of 
duplicate images only in other resolutions from the srcsets) [`0345abc`](https://github.com/janreges/siteone-crawler/commit/0345abc6dab770e3196dd88ff0123a2050828644) - url parameter: it is already possible not to enter the scheme and https:// or http:// will be added automatically (http:// for e.g. for localhost) [`85e14e9`](https://github.com/janreges/siteone-crawler/commit/85e14e961b53b83c208ac936972a335cace61bf8) - disabled images: in the case of a request to remove the images, replace their body with a 1x1px transparent gif and place a semi-transparent hatch with the crawler logo and opacity as a background [`c1418c3`](https://github.com/janreges/siteone-crawler/commit/c1418c3154301fd3995dde421b066f16850203e7) - url regex filtering: added option , which will allow you to limit the list of crawled pages according to the declared regexps, but at the same time it will allow you to crawl and download assets (js, css, images, fonts, documents, etc.) from any URL (but with respect to allowed domains) [`21e67e5`](https://github.com/janreges/siteone-crawler/commit/21e67e5be74050cd5b7c9998654ed66f18db4d85) - img srcset parsing: because a valid URL can also contain a comma (and various dynamic parametric img generators use them) and in the srcset a comma+whitespace should be used to separate multiple values, this is also reflected in the srcset parsing [`0db578b`](https://github.com/janreges/siteone-crawler/commit/0db578bda37c024b2b111c814e35c2107e4751ad) - websocket server: added option to set --websocket-server, which starts a parallel process with the websocket server, through which the crawler sends various information about the progress of crawling (this will also be used by Electron UI applications) [`649132f`](https://github.com/janreges/siteone-crawler/commit/649132f8965421cd1bb3570fbb9f534e6caef313) - http client: handle scenario when content loaded from cache is not valid (is_bool) 
[`1ddd099`](https://github.com/janreges/siteone-crawler/commit/1ddd099ecdadc5752016237ec1f0acf80e907dc8) - HTML report: updated logo with final look [`2a3bb42`](https://github.com/janreges/siteone-crawler/commit/2a3bb428180067a649f2467419920b3d4f70a9fd) - mailer: shortening and simplifying email content [`e797107`](https://github.com/janreges/siteone-crawler/commit/e7971071f8c5e4cff1472464ce9ec4407c198a59) - robots.txt: added info about loaded robots.txt to summary (limited to 10 domains for case of huge multi domain crawling) [`00f9365`](https://github.com/janreges/siteone-crawler/commit/00f93659637705bc6389c5f073a29f09b743370f) - redirects analyzer: handled edge case with empty url [`e9be1e3`](https://github.com/janreges/siteone-crawler/commit/e9be1e350b1d114c54b7099b54277da23467b538) - text output: added fancy banner with crawler logo (thanks to great SiteOne designers!) and smooth effect [`e011c35`](https://github.com/janreges/siteone-crawler/commit/e011c35f3cbc87fceb9d7a9c56c726817c79b543) - content processors: added applyContentChangesBeforeUrlParsing() and better NextJS chunks handling [`e5c404f`](https://github.com/janreges/siteone-crawler/commit/e5c404f2d52a7c2ebdb80ae3c93760c7e881dc9a) - url searches: added ignoring data:, mailto:, tel:, file:// and other non-requestable resources also to FoundUrls [`5349be2`](https://github.com/janreges/siteone-crawler/commit/5349be242f99567b8f5f093537a696ef5fd319ac) - crawler: added declare(strict_types=1) and banner [`27134d2`](https://github.com/janreges/siteone-crawler/commit/27134d29d16e3e24c633f010f731f11deeeadcb7) - heading structure analysis: highlighting and calculating errors for duplicate <h1> + added help cursor with a hint [`f5c7db6`](https://github.com/janreges/siteone-crawler/commit/f5c7db6206ed06e0cbaf38a7ae2505be573da2e6) - core options: added --help and --version, colorized help [`6f1ada1`](https://github.com/janreges/siteone-crawler/commit/6f1ada112898580d2de028c02e32fdeb8ad2a845) - ./crawler binary - 
send output of cd - to /dev/null and hide unwanted printed script path [`16fe79d`](https://github.com/janreges/siteone-crawler/commit/16fe79d08e24c4a6fbd87d16417413725aaa24e8) - README: updated paths in the documentation - it is now possible to use the ERROR: Option --url () must be valid URL [`86abd99`](https://github.com/janreges/siteone-crawler/commit/86abd998da94971c2512b6018085f39e8dd5db7f) - options: --workers default for Cygwin runtime is now 1 (instead of 3), because Cygwin runtime is highly unstable when workers > 1 [`f484960`](https://github.com/janreges/siteone-crawler/commit/f4849606fb382e1b759f547c4f1bfe2e5d8b4d02) #### [v1.0.3](https://github.com/janreges/siteone-crawler/compare/v1.0.2...v1.0.3) > 10 November 2023 - version: 1.0.3.20231110 + changelog [`5b80965`](https://github.com/janreges/siteone-crawler/commit/5b8096550dcd489a998d34fae44e3d99375e33e3) - cache/storage: better race-condition handling in a situation where several coroutines could write the same folder at one time, then mkdir reported 'File exists' [`be543dc`](https://github.com/janreges/siteone-crawler/commit/be543dc195e675e49064b20ee091903f1977942a) #### [v1.0.2](https://github.com/janreges/siteone-crawler/compare/v1.0.1...v1.0.2) > 10 November 2023 - version: 1.0.2.20231110 + changelog [`230b947`](https://github.com/janreges/siteone-crawler/commit/230b9478a36ee664dfe080447c09da9c4a9bc25c) - html report: added aria labels to active/important elements [`a329b9d`](https://github.com/janreges/siteone-crawler/commit/a329b9d4e0f040996c17cb3382cf3c07c61a4b35) - version: 1.0.1.20231109 - changelog [`50dc69c`](https://github.com/janreges/siteone-crawler/commit/50dc69c9ab956691bbf97860355d410a0bdba0c9) #### [v1.0.1](https://github.com/janreges/siteone-crawler/compare/v1.0.0...v1.0.1) > 9 November 2023 - version: 1.0.1.20231109 [`e213cb3`](https://github.com/janreges/siteone-crawler/commit/e213cb326db78e2f69fd3e4f04b9728223550a3d) - offline exporter: fixed case when on https:// website is link 
to same path but with http:// protocol (it overrode proper *.html file just with meta redirect .. real case from nextjs.org) [`4a1be0b`](https://github.com/janreges/siteone-crawler/commit/4a1be0bdfb62167c498f6c3b4c91fe74532ff833) - html processor: force to remove all anchor listeners when NextJS is detected (it is very hard to achieve a working NextJS with offline file:// protocol) [`2b1d935`](https://github.com/janreges/siteone-crawler/commit/2b1d935419bade80d8e6ab07b2ae04ded0df131e) - file exporters: now by default crawler generates a html/json/txt report to 'tmp/[report|output].%domain%.%datetime%.[html|json|txt]' .. i assume that most people will want to save/see them [`7831c6b`](https://github.com/janreges/siteone-crawler/commit/7831c6b87dd41444a0fca529bc450bf7934ef541) - security analysis: removed multi-line console output for recommendations .. it was ugly [`310af30`](https://github.com/janreges/siteone-crawler/commit/310af308859dbb2fd5895af468195e2339f2788d) - json output: added JSON_UNESCAPED_UNICODE for unescaped unicode chars (e.g. czech chars will be readable) [`cf1de9f`](https://github.com/janreges/siteone-crawler/commit/cf1de9f60820963ccb78a00b43ca3aec8b311a77) - mailer: do not send e-mails in case of interruption of the crawler using ctrl+c [`19c94aa`](https://github.com/janreges/siteone-crawler/commit/19c94aac8211b4550ba11497e1332d604f8cdbc7) - refactoring: manager stats logic extracted into ManagerStats and implemented also into manager of content processors + stats added into 'Crawler stats' tab in HTML report [`3754200`](https://github.com/janreges/siteone-crawler/commit/3754200652dc91ac05efe22812e64c0e4be84019) - refactoring: content related logic extracted to content processors based on ContentProcessor interface with methods findUrls():?FoundUrls, applyContentChangesForOfflineVersion():void and isContentTypeRelevant():bool + better division of web framework related logic (NextJS, Astro, Svelte, ...) 
+ better URL handling and maximized usage of ParsedUrl [`6d9f25c`](https://github.com/janreges/siteone-crawler/commit/6d9f25ce82f8a1cfbfbc6bc0b5a6a07262c427b1) - phpstan: ignore BASE_DIR warning [`6e0370a`](https://github.com/janreges/siteone-crawler/commit/6e0370aafe02d3bb2ca528ea8a9a37995f5ddce6) - offline website exporter: improved export of a website based on NextJS, but it's not perfect, because latest NextJS version do not have some JS/CSS path in code, but they are generated dynamicly from arrays/objects [`c4993ef`](https://github.com/janreges/siteone-crawler/commit/c4993efcb97f7058834713ed273f9c4274be5cad) - seo analyzer: fixed trim() warning when no <h1> found [`f0c526f`](https://github.com/janreges/siteone-crawler/commit/f0c526f5d2ff7d0155c1bfc7da7a6c0f2f7a1419) - offline export: a lot of improvements when generating the offline version of the website on NextJS - chunk detection from the manifest, replacing paths, etc. [`98c2e15`](https://github.com/janreges/siteone-crawler/commit/98c2e15acf4e22d25301d160968555c19ddd44cc) - seo and og: fixed division by zero when no og/twitter tags found [`19e4259`](https://github.com/janreges/siteone-crawler/commit/19e4259c519a3e41eb7aa8eabce80e6364e74639) - console output: lots of improvements for nice, consistent and minimal word-wrap output [`596a5dc`](https://github.com/janreges/siteone-crawler/commit/596a5dc17945359ffc0fef2ed8ed8ee8bfc1db00) - basic file/dir structure: created ./crawler (for Linux/macOS) and ./crawler.bat for Windows, init script moved to ./src, small related changes about file/dir path building [`5ce41ee`](https://github.com/janreges/siteone-crawler/commit/5ce41ee8e78425747bf40327152bd99499c64013) - header status: ignore too dynamic Content-Disposition header [`4e0c6fd`](https://github.com/janreges/siteone-crawler/commit/4e0c6fdf5c356f8c0eea78ccebe29641b90f96b4) - offline website exporter: added .html extensions to typical dynamic language extensions, because without it the browser will show them 
as source code [`7130b9e`](https://github.com/janreges/siteone-crawler/commit/7130b9eb666eca5b08c9dbeda91198bc85b31379) - html report: show tables with details, even if they are without data (it is good to know that the checks were carried out, but nothing was found) [`da019e4`](https://github.com/janreges/siteone-crawler/commit/da019e4591682c21e9f78de1ec26939088d92ccc) - tests: repaired tests after last changes of file/url building for offline website .. merlot is great! [`7c77c41`](https://github.com/janreges/siteone-crawler/commit/7c77c411ff67c01e07d16cb2acce0e926b264fcd) - utils: be more precise and do not replace attributes in SVG .. creative designers will not love you when looking at the broken SVG in HTML report [`3fc81bb`](https://github.com/janreges/siteone-crawler/commit/3fc81bb0c47eef2935da2e74721a809a9aff0959) - utils: be more precise in parsing phone numbers, otherwise people will 'love' you because of false positives .. wine is still great [`51fd574`](https://github.com/janreges/siteone-crawler/commit/51fd574c764d832d74cb5e67eed890bd9d349a5c) - html parser: better support for formatted html with tags/attributes on multiple lines [`89a36d2`](https://github.com/janreges/siteone-crawler/commit/89a36d2fcf3d96b61c4b3d2e20d5a46f4cb96cb8) - utils: don't be hungry in stripJavaScript() because you ate half of my html :) wine is already in my head... 
[`0e00957`](https://github.com/janreges/siteone-crawler/commit/0e0095727638b7940d2e555a6be231ad3dde19e4) - file result storage: changed cache directory structure for consistency with http client's cache, so it looks like my.domain.tld-443/04/046ec07c.cache [`26bf428`](https://github.com/janreges/siteone-crawler/commit/26bf428f95bc428485d7cf505e74c8a69c94d869) - http client cache: for better consistency with result storage cache, directory structure now contains also port, so it looks like my.domain.tld-443/b9/b989bdcf2b9389cf0c8e5edb435adc05.cache [`a0b2e09`](https://github.com/janreges/siteone-crawler/commit/a0b2e09d01e36aed56c0208a8001d616755de096) - http client cache: improved directory structure for large scale and better orientation for partial cache deleting.. current structure in tmp dir: my.domain.tld/b9/b989bdcf2b9389cf0c8e5edb435adc05.cache [`10e02c1`](https://github.com/janreges/siteone-crawler/commit/10e02c189297f28ea563ba6f3792462c2d6790ea) - offline website exporter: better srcset handling - urls can be defined with or without sizes [`473c1ad`](https://github.com/janreges/siteone-crawler/commit/473c1ad0d753df209aa160b0d90687c4bff21912) - html report: blue color for search term, looks better [`cb47df9`](https://github.com/janreges/siteone-crawler/commit/cb47df98e230c0375dbcb14c278250709bf3644a) - offline website exporter: handled situation of the same-name folder/file when both the folder /foo/next.js/ and the file /foo/next.js existed on the website (real case from vercel.com) [`7c27d2c`](https://github.com/janreges/siteone-crawler/commit/7c27d2c2277dd134615563ee4eaa706ec0ee7485) - exporters: added exec times to summary messages [`41c8873`](https://github.com/janreges/siteone-crawler/commit/41c8873dc33d7f08d91f77d71fcf1bf2fafa30ae) - crawler: use port from URL if defined or by scheme .. 
previous solution didn't work properly for localhost:port and parsed URLs to external websites [`324ba04`](https://github.com/janreges/siteone-crawler/commit/324ba04267b962a56817dd10e3ecba7777702aa2) - heading analysis: changed sorting to DESC by errors, renamed Headings structure -> Heading structure [`dbc1a38`](https://github.com/janreges/siteone-crawler/commit/dbc1a38f33d4094aebe64020531518538e2b3baf) - security analysis: detection and ignoring of URLs that point to a non-existent static file but return 404 HTML, better description [`193fb7d`](https://github.com/janreges/siteone-crawler/commit/193fb7dcf1f994aba69b646576bf7c6f8701a975) - super table: added escapeOutputHtml property to column for better escape managing + updated related supertables [`bfb901c`](https://github.com/janreges/siteone-crawler/commit/bfb901cb82b9cda81198df0dc87885b5eceb5c93) - headings analysis: replace usage of DOMNode->textContent because when the headings contain other tags, including <script>, textContent also contains JS code, but without the <script> tag [`5c426c2`](https://github.com/janreges/siteone-crawler/commit/5c426c24969a063aa3366da02520025733cf16e7) - best practices: better missing quotes detection and minimizing false positives in special cases (HTML/JS in attributes, etc.) [`b03a534`](https://github.com/janreges/siteone-crawler/commit/b03a5345e7f71f880ee4d36fb9f51c230d8c772f) - best practices: better SVG detection and minimizing false positives (e.g. 
code snippets with SVG), improved look in HTML report and better descriptions [`c35f7e2`](https://github.com/janreges/siteone-crawler/commit/c35f7e226f6cd384e5c8cf4b9af3a1a0d3be4cfc) - headers analysis: added [ignored generic values] or [see values below] for specific headers [`a7b444d`](https://github.com/janreges/siteone-crawler/commit/a7b444dab0e1c3949abfa0e0746db18343b9b55d) - core options: changed --hide-scheme-and-host to --show-scheme-and-host (by default is hidden schema+host better) [`3c202e9`](https://github.com/janreges/siteone-crawler/commit/3c202e998a824f97b6f481575a24e2924c9dc663) - truncating: replaced '...' with '…' [`870cf8c`](https://github.com/janreges/siteone-crawler/commit/870cf8cd447fd14e389d76bcc8853b1e691f5349) - accessibility analyzer: better descriptions [`514b471`](https://github.com/janreges/siteone-crawler/commit/514b47124d101cd4f0bd67148f41ea5644febd62) - crawler & http client: if the response is loaded from the cache, we do not wait due to rate limiting - very useful for repeated executions [`61fbfab`](https://github.com/janreges/siteone-crawler/commit/61fbfab34ba07c1856099051b8f68dc76b1adf09) - header stats: added missing strval in values preview [`9e11030`](https://github.com/janreges/siteone-crawler/commit/9e1103064af0962ed4963cace61bf7ad201d19a2) - content type analyzer: increased column width for MIME type from 20 to 26 (enough for application/octet-stream) [`c806674`](https://github.com/janreges/siteone-crawler/commit/c806674ee82d0aba90a9d61e10ff2b5e2cf6c813) - SSL/TLS analyzer: fixed issues on Windows with Cygwin where nslookup does not work reliably [`714b9e1`](https://github.com/janreges/siteone-crawler/commit/714b9e12a2426574731b62d460c98f1fed95aa18) - text output: removed redundant whitespaces from banner after .YYYYMMDD was added to the version number [`8b76205`](https://github.com/janreges/siteone-crawler/commit/8b76205b41ca9cbf4dd32e7d908f4fe932c4a2a3) - readme: added link to #ready-to-use-releases to summary 
[`574b39e`](https://github.com/janreges/siteone-crawler/commit/574b39e836794c98e7be8ceaa81d1ab0c50ab149) - readme: added section Ready-to-use releases [`44d686b`](https://github.com/janreges/siteone-crawler/commit/44d686b910a36747d002ec2886b85c22be5c4864) - changelog: added changelog by https://github.com/cookpete/auto-changelog/tree/master + added 'composer changelog' [`d11af7e`](https://github.com/janreges/siteone-crawler/commit/d11af7e4d847362276e1dd4cec3c25cad38263fb) #### v1.0.0 > 7 November 2023 - proxy: added support for --proxy=<host:port>, closes #1 [`#1`](https://github.com/janreges/siteone-crawler/issues/1) - license: renamed to LICENSE.md [`c0f8ec2`](https://github.com/janreges/siteone-crawler/commit/c0f8ec22a68741b1740981dc98bdec13d8e5182a) - license: added license CC 4.0 BY [`bd5371b`](https://github.com/janreges/siteone-crawler/commit/bd5371b99363fbb5de29c33f0fcc572d154e467d) - version: set v1.0.0.20231107 [`bdbf2be`](https://github.com/janreges/siteone-crawler/commit/bdbf2be97e68cfa01fb992fb960c1c5313d5780f) - version: set v1.0.0 [`a98e61e`](https://github.com/janreges/siteone-crawler/commit/a98e61e161652861541743df6fe1d8c55be446f9) - SSL/TLS analyzer: uncolorize valid-to in summary item, phpstan fixes (non-funcional changes) [`88d1d9f`](https://github.com/janreges/siteone-crawler/commit/88d1d9fec8bc29cd26ab88c18d6c122939b59bba) - content type analyzer: added table with MIME types [`b744f13`](https://github.com/janreges/siteone-crawler/commit/b744f139e417b625bd22ea282f744b55406853b1) - seo analysis: added TOP10 non-unique titles and descriptions to tab SEO and OpenGraph + badges [`4ae14c1`](https://github.com/janreges/siteone-crawler/commit/4ae14c13be5163704c2c6a2d55d75bc83f41f801) - html report: increased sidebar width to prevent wrapping in the case of higher numbers in badges [`c5c8f4c`](https://github.com/janreges/siteone-crawler/commit/c5c8f4cae991bbdd6b6a8a7fab6cbaae1c199344) - dns analyzer: increased column size to prevent auto-truncation of 
dns/ip addresses [`b4d4127`](https://github.com/janreges/siteone-crawler/commit/b4d4127b2b67efd63fff53ae0ad27b6c9a987501) - html report: fixed badge with errors on DNS and SSL tab [`e290403`](https://github.com/janreges/siteone-crawler/commit/e29040349ac4966b22842e52ee4c102a67f9860c) - html report: ensure that no empty tabs will be in report (e.g. in case where all analyzers will be deactivated by --analyzer-filter-regex='/anything/') [`6dd5bcc`](https://github.com/janreges/siteone-crawler/commit/6dd5bcc67d215bca085ef75cb98398aa162ce5fa) - html report: improved replacement of non-badged cells to transparent badge for better alignment [`172a074`](https://github.com/janreges/siteone-crawler/commit/172a074c519a55c492d2b72250232e23749cd75b) - html report: increased visible part of long tables from 500px to 658px (based on typical sidebar height), updated title [`0be355f`](https://github.com/janreges/siteone-crawler/commit/0be355f5474ad6aff461ac3362127569d29eac22) - utils: selected better colors for ansi->html conversion [`6c2a8e3`](https://github.com/janreges/siteone-crawler/commit/6c2a8e364790e2cdb338f164c572aafd9e3db6c1) - SSL/TLS analyzer: evaluation and hints about unsafe or recommended protocols, from-to validation, colorized output [`5cea1fe`](https://github.com/janreges/siteone-crawler/commit/5cea1fe51d500db433c4d86fe5fa8660d2ef2a14) - SEO & OpenGraph analyzers: refactored class names, headings structure moved to own tab, other small improvements [`75a9724`](https://github.com/janreges/siteone-crawler/commit/75a97245af1e896ab3304891dd4459873ad3a26f) - security analyzer: better vulnerabilities explanation and better output formatting [`ee172cb`](https://github.com/janreges/siteone-crawler/commit/ee172cb25073e2e5452b38d5a6c52802e9585bcc) - summary: selected more suitable icons from the utf-8 set that work well in the console and HTML [`ef67483`](https://github.com/janreges/siteone-crawler/commit/ef67483827755895f0edf3149f4f106d28ba1942) - header stats: addValue() 
can accept both string and array [`a0d746b`](https://github.com/janreges/siteone-crawler/commit/a0d746ba9f956c03cb4ad1bddee14a26951ff86d) - headers & redirects - text improvements [`3ac9010`](https://github.com/janreges/siteone-crawler/commit/3ac9010c33e9048f1b3d24182232ae182ae681ca) - dns analyzer: colorized output and added info about CNAME chain into summary [`7dd1f8a`](https://github.com/janreges/siteone-crawler/commit/7dd1f8ac1eafcdcd92f651d397b561f6383fdcfc) - best practices analyzer: added SVG sanitization to prevent XSS, fine-tuning of missing quotes detection, typos [`4dc1eb5`](https://github.com/janreges/siteone-crawler/commit/4dc1eb592de3631f61ed67dfb87466a95462d5f3) - options: added extras option, e.g. for number range validation [`760a865`](https://github.com/janreges/siteone-crawler/commit/760a865082a7cd5f8e439f3fc9094fb7503a78be) - seo and socials: small type-hint and phpstan fixes [`bf695be`](https://github.com/janreges/siteone-crawler/commit/bf695be5fa859ca49bef67fb6511039e4301bb34) - best practice analyzer: added found depth to messages about too deep DOM depth [`220b43c`](https://github.com/janreges/siteone-crawler/commit/220b43c77a6d4747a29cf483e11a985dc07ac460) - analysis: added SSL/TLS analyzer with info about SSL certificate, its validity, supported protocols, issuer .. in the report SSL/TLS info are under tab 'DNS and TLS/SSL' [`3daf175`](https://github.com/janreges/siteone-crawler/commit/3daf1757e1eee765ea3d6b2dca1ed55ffb694d4a) - super table: show fulltext only for >= 10 rows + visible height of the table in HTML shorten to 500px/20 rows and show 'Show entire table' link .. implemented only with HTML+CSS, so that it also works on devices without JS (e.g. 
e-mail browser on iOS) [`7fb9e52`](https://github.com/janreges/siteone-crawler/commit/7fb9e52de2514b0fc1a11032238de815f76acb37) - analysis: added seo & sharing analysis - meta info (title, h1, description, keywords), OG/Twitter data, heading structure details [`53e12e6`](https://github.com/janreges/siteone-crawler/commit/53e12e63102d70b0329194493599523808758716) - best practices: added checks for WebP and AVIF images [`0ccabc6`](https://github.com/janreges/siteone-crawler/commit/0ccabc633cdae4b7ef7b03aad22ab8cfab1a590f) - best practices: added brotli support reporting to tables [`7ff2c53`](https://github.com/janreges/siteone-crawler/commit/7ff2c53e56705c19de77d54db578338252007b99) - super table: added option to specify whether the table should be displayed on the output to the console, html or json [`6bb6217`](https://github.com/janreges/siteone-crawler/commit/6bb62177522a61bab1673b9d5f19e18f50bd54a3) - headers analysis: analysis of HTTP headers of all requests to the main domain, their detailed breakdown, values and statistics [`1fcc1db`](https://github.com/janreges/siteone-crawler/commit/1fcc1dba38a3ac41f0547a4f11a2aef9af1d876f) - analysis: fixed search of attributes with missing quotes [`3db31b9`](https://github.com/janreges/siteone-crawler/commit/3db31b9c01317d8c8ac6eba6b98679be79982c3e) - super table: added the number of found/displayed lines next to the full text [`6e7f3d4`](https://github.com/janreges/siteone-crawler/commit/6e7f3d4b4de0cfa378920c9389291a9902c0c486) - super table: removed setting column widths for HTML table - works best without forcing widths [`2a785e7`](https://github.com/janreges/siteone-crawler/commit/2a785e70b675ef681b005042a50b289b3b29d600) - html report: even wider content of the report is allowed, for better functioning for high-resolution displays [`363990c`](https://github.com/janreges/siteone-crawler/commit/363990c3566cb39d653ab2760df6bb4d2acd8149) - pages 404: truncate too long urls 
[`082bae6`](https://github.com/janreges/siteone-crawler/commit/082bae6f28d2ba8296591a0885548faa0b38a59a) - fixes: fixed various minor warnings related to specific content or parameters [`da1802d`](https://github.com/janreges/siteone-crawler/commit/da1802d82f8ccf2de3f4329bf3b952ebefeb3449) - options: ignore extra comma or empty value in list [`3f5cab6`](https://github.com/janreges/siteone-crawler/commit/3f5cab68bc4981faea7b7bed30b9f687ea773830) - super table: added useful fulltext search for all super tables [`50a4edf`](https://github.com/janreges/siteone-crawler/commit/50a4edf9caa69f67fdc21c3c32a92d201c211ccc) - colors: more light color for badge.neutral in light mode because previous was too contrasting [`0dbad09`](https://github.com/janreges/siteone-crawler/commit/0dbad0920f8f8a9f14186f9513e3ea6793fcf297) - colors: notice is now blue instead of yellow and severity order fix in some places (critical -> warning -> notice -> ok -> info) [`1b50b99`](https://github.com/janreges/siteone-crawler/commit/1b50b99ae079a4d1cdc350038e105d469dec524a) - colors: changed gray color to more platform-consistent color, otherwise gray was too dark on macOS [`173c9bd`](https://github.com/janreges/siteone-crawler/commit/173c9bd211bf066b69bb3adbde487ec3e99f6da1) - scripts: removed helper run.tests* scripts [`e9f0c8f`](https://github.com/janreges/siteone-crawler/commit/e9f0c8ff768042737bfab57b5d2270df995c611e) - analysis: added table with detailed list of security findings and URLs [`5b9e0fe`](https://github.com/janreges/siteone-crawler/commit/5b9e0fe1c3a514941abf2e277bf3f2bd4e017004) - analysis: added SecurityAnalyzer, which checks the existence and values of security headers and performs HTML analysis for common issues [`0cb7cb9`](https://github.com/janreges/siteone-crawler/commit/0cb7cb9daac5303227e31b72b0f6931218968bf7) - http auth: added support for basic HTTP authentication by --http-auth=username:password 
[`147e004`](https://github.com/janreges/siteone-crawler/commit/147e0040e97f6ad37da7897813063cbb73302e22) - error handling: improved behaviour in case of entering a non-existent domain or problems with DNS resolving [`5c08fb4`](https://github.com/janreges/siteone-crawler/commit/5c08fb4c82409863f73fcdcd66f9a0ba76206c5c) - html report: implemented completely redesigned html report with useful information, with light/dark mode and possibility to sort tables by clicking on the header .. design inspired by Zanrly from Shuffle.dev [`05da14f`](https://github.com/janreges/siteone-crawler/commit/05da14f50b108deec4827c5c0324bbd1b9775b37) - http client: fix of extension detection in the case of very non-standard or invalid URLs [`113faa5`](https://github.com/janreges/siteone-crawler/commit/113faa501016f14c017f5f1eaa586a6fae35efbf) - options: increased default memory limit from 512M to 2048M + fixed refactored 'file-system' -> 'file' in docs for result storage [`1471b28`](https://github.com/janreges/siteone-crawler/commit/1471b2884bcbf1806a388e4ae85cc4f7e1bc11fe) - utils: fix that date formats are not detected as a phone number in parsePhoneNumbersFromHtml() [`e4e1009`](https://github.com/janreges/siteone-crawler/commit/e4e10097f7e74816dd716d2713516d5ff8eef39a) - strict types: added declare(strict_types=1) to all classes with related fixes and copyright [`92dd47c`](https://github.com/janreges/siteone-crawler/commit/92dd47c72e4f1aaa5a05187f60f2a9f0a5c285ee) - dns analyzer: added information about the DNS of the given domain - shows the entire cname/alias chain as well as the final resolved IPv4/IPv6 addresses + tests [`199421d`](https://github.com/janreges/siteone-crawler/commit/199421df3c96e2f2bec20f45230cbd812e9fc21c) - utils: helper function parsePhoneNumbersFromHtml() used in BestPracticeAnalyzer + tests [`09cc5fb`](https://github.com/janreges/siteone-crawler/commit/09cc5fbbbdf7f4a706ef912221e32d476fa397b4) - summary consistency: forced dots at the end of each item in the 
summary list [`4758e38`](https://github.com/janreges/siteone-crawler/commit/4758e38c3b2ab73476516662129e3b6abd78ff44) - crawler: support for more benevolent tags for title and meta attributes .. e.g. even the title can contain other HTML attributes [`770b339`](https://github.com/janreges/siteone-crawler/commit/770b339fb7b6ac86af56a864feb184977974d37d) - options: default timeout increased from 3 to 5 seconds .. after testing on a lot of websites, it makes better sense [`eb74207`](https://github.com/janreges/siteone-crawler/commit/eb7420736f5c4d353651ec39d8d030a8485e1486) - super table: added option to force non-breakable spaces in column cells [`3500818`](https://github.com/janreges/siteone-crawler/commit/35008185064331d33c380e0643606f2dbaeb2b64) - best practice analyzer: added measurement of individual steps + added checking of active links with phone numbers <a href="tel: 123..."> [`1bb39e8`](https://github.com/janreges/siteone-crawler/commit/1bb39e87a440975e8956fbf1d66b81ef1b424574) - accessibility analyzer: added measurement of individual steps + removed DOMDocument parsing after refactoring [`2a7c49b`](https://github.com/janreges/siteone-crawler/commit/2a7c49b415dd2864cc37497d409cb083abb99df5) - analysis: added option to measure the duration and number of analysis steps + the analyzeVisitedUrl() method already accepts DOMDocument (if HTML) so the analyzers themselves do not have to do it twice [`d8b9a3d`](https://github.com/janreges/siteone-crawler/commit/d8b9a3d8e0016ec4cc6da908a1bd9db39370e9da) - super table: calculated auto-width can't be shorter than column name (label) [`b97484f`](https://github.com/janreges/siteone-crawler/commit/b97484f22d59bee04b935fa204d18c609ba8658c) - utils: removed ungreedy flag from all regular expressions, it caused problems under some circumstances [`03fc202`](https://github.com/janreges/siteone-crawler/commit/03fc202ed2f30fe4bd2001e8fcaecbea5ca45f7e) - phpstan: fixed all level 5 issues 
[`04c21aa`](https://github.com/janreges/siteone-crawler/commit/04c21aaeeed24117740fac22b5756363e3a4769d) - phpstan: fixed all level 4 issues [`91fee49`](https://github.com/janreges/siteone-crawler/commit/91fee49a0aefa603c4dba9bc1f19d658a7ab413e) - phpstan: fixed all level 3 issues [`2f7866a`](https://github.com/janreges/siteone-crawler/commit/2f7866a389b05e3c796e7f1f0bd7f6410a23cb05) - phpstan: fixed all level 2 issues [`e438996`](https://github.com/janreges/siteone-crawler/commit/e4389962be4a476bdcacc6acc18f36c7037b90ee) - phpstan: installed phpstan with level 2 for now [`b896e6c`](https://github.com/janreges/siteone-crawler/commit/b896e6c0552e4fd938088594a7d44d6af14fc809) - tests: allowed nextjs.org for crawling (incorrectly because of this, a couple of tests did not pass) [`cdc7f56`](https://github.com/janreges/siteone-crawler/commit/cdc7f5688f6aca0e822c3fa6daee6a3acd99eeeb) - refactor: moved /Crawler/ into /src/Crawler/ + added file attachment support to mailer [`2f0d26c`](https://github.com/janreges/siteone-crawler/commit/2f0d26c7d2f7cb65495b375dd4b11bf7849888e2) - sitemap exporter: renamed addErrorToSummary -> addCriticalToSummary [`e46e192`](https://github.com/janreges/siteone-crawler/commit/e46e1926df52a3edfc4137ebd8ede9dee8a45bf1) - text output: added options --show-inline-criticals and --show-inline-warning which displays the found problems directly under the URL - the displayed table will be less clear, but the problems are clearly visible [`725b212`](https://github.com/janreges/siteone-crawler/commit/725b2124172710895d86503fd4a933e2ea91efaa) - composer.json: added require declarations for ext-dom, ext-libxml (used in analyzers) and ext-zlib (used in cache/storages) [`3542cf0`](https://github.com/janreges/siteone-crawler/commit/3542cf03829e9a3c745e58e0df1bc2f6284d25ba) - analysis: added accessibility and best practices analyzers with useful checks [`860316f`](https://github.com/janreges/siteone-crawler/commit/860316fa685509104462412aeb125417dceaee28) - 
analysis: added AnalysisManager for better analysis control with the possibility to filter required analyzers using --analyzer-filter-regex [`150569f`](https://github.com/janreges/siteone-crawler/commit/150569fd20c380781ed5971cefd47308762a730a) - result storage: options --result-storage, --result-storage-dir and --result-storage-compression for storage of response bodies and headers (by default is used memory storage but you can use file storage for extremely large websites) [`d2a8fab`](https://github.com/janreges/siteone-crawler/commit/d2a8fabcef72067500dfcb0065e87ebc4395dac3) - http cache: added --http-cache-dir and --http-cache-compression parameters (by default http cache is on and set to 'tmp/http-client-cache' and compression is disabled) [`2eb9ed8`](https://github.com/janreges/siteone-crawler/commit/2eb9ed86d9d53b4735a3de3cf6d06b652818dbc0) - super table: the currentOrderColumn is already optional - sometimes we want to leave the table sorted according to the input array [`4fba880`](https://github.com/janreges/siteone-crawler/commit/4fba880fcf137a6207df4c5177cf3ec80afaa3ae) - analysis: replaced severity ok/warning/error with ok/notice/warning/critical - it made more sense for analyzers [`18dbaa7`](https://github.com/janreges/siteone-crawler/commit/18dbaa7a4a760874ba39c75af28f7e808fb8eb2e) - analysis: added support for immediate analysis of visited URLs with the possibility to insert the analyzer's own columns into the main table [`004865f`](https://github.com/janreges/siteone-crawler/commit/004865f223c9ec688c4f522cd8f93d8022458130) - content types: fixed json/xml detection [`00fc180`](https://github.com/janreges/siteone-crawler/commit/00fc1808838c7a191cc9986e884ffda26f841281) - content type analyzer: decreased URLs column size from 6 to 5 - that's enough [`2eefbaf`](https://github.com/janreges/siteone-crawler/commit/2eefbafad24f68118a2efe8d6ddedc4d3d45b5cf) - formatting: unification of duration formatting across the entire application 
[`412ee7a`](https://github.com/janreges/siteone-crawler/commit/412ee7ab5c5eda19dfc5492a6cc9edbb7c5969c6) - super table: fixed sorting for array of arrays [`4829be8`](https://github.com/janreges/siteone-crawler/commit/4829be8f8e1d3f0d8201dedfa99d245453601422) - source domains analyzer: minor formatting improvements [`2d32ced`](https://github.com/janreges/siteone-crawler/commit/2d32cedb59aa13e4e27a1dbe58eff586e4407cd9) - offline website exporter: added info about successful export to summary [`92e7e46`](https://github.com/janreges/siteone-crawler/commit/92e7e46bdbc1f1cff329cf4aff5ee99dd70332e2) - help: added red message about invalid CLI parameters also to the end of help output, because help is already too long [`6942e8f`](https://github.com/janreges/siteone-crawler/commit/6942e8f4535d748763a124207634ea7548bbfa83) - super table: added column property 'formatterWillChangeValueLength' to handle situation with the colored text and broken padding [`7371a68`](https://github.com/janreges/siteone-crawler/commit/7371a68f11191b0b21307e6ca703e362f476b815) - analyzers: setting a more meaningful analyzers order [`5e8f747`](https://github.com/janreges/siteone-crawler/commit/5e8f747392f291abdfb0140038c42fe84801955c) - analyzers: added source domains analyzer with summary of domains and downloaded content types (number/size/duration) [`f478f17`](https://github.com/janreges/siteone-crawler/commit/f478f178fb2f79a81e5db89909951816ac6e1c9f) - super table: added auto-width column feature [`d2c04de`](https://github.com/janreges/siteone-crawler/commit/d2c04dec3312d72ed373236d73f7a4d3bbf8c20d) - renaming: '--max-workers' to '--workers' with possibility to use shortcut '-w=<num>' + adding possibility to use shortcut '-rps=<num>' for '--max-reqs-per-sec=<num>' [`218f8ff`](https://github.com/janreges/siteone-crawler/commit/218f8ffcca15550853bcb4ace44dedf260d1e735) - extra columns: added ability to force columns to the required length via "!" 
+ refactoring using ExtraColumn [`def82ff`](https://github.com/janreges/siteone-crawler/commit/def82ff3f5f11efa2e4ef812e086a5c8379ac962) - readme: division of features into several groups and divided accordingly [`c03d231`](https://github.com/janreges/siteone-crawler/commit/c03d2311b618f8aad165ffad39ae51989f60f846) - offline exporter: export of the website to the offline form has already been fine-tuned (but not perfect yet), --disable-* options to disable JS/CSS/images/fonts/etc. and a lot of other related functionalities [`0d04a98`](https://github.com/janreges/siteone-crawler/commit/0d04a9805bdebea708eba44cc6680bd58995d559) - crawler: added possibility to set speed via --max-reqs-per-sec (default 10) [`d57cc4a`](https://github.com/janreges/siteone-crawler/commit/d57cc4a39e6ce1882ee3233b015200382d90f06f) - tests: dividing asserts for URL conversion testing into different detailed groups [`f6221cb`](https://github.com/janreges/siteone-crawler/commit/f6221cb5d3e5e844f146a95940479b20604c37cf) - html url parser: added support for loading fonts from <link href='...'> [`4c482d1`](https://github.com/janreges/siteone-crawler/commit/4c482d1078fb535e4a3be96f6c3e7ded2ea02d65) - manager: remove avif/webp support if OfflineWebsiteExporter is active - we want to use only long-supported jpg/png/gif on the local offline version [`3ec81d3`](https://github.com/janreges/siteone-crawler/commit/3ec81d338590ae16ee337cbbfa8a741e01b0522d) - http response: transformation of the redirect to html with redirection through the <meta> tag [`8f6ff16`](https://github.com/janreges/siteone-crawler/commit/8f6ff161066a82af9ae91a738aae66327fe407b6) - initiator: skip comments or empty arguments [`12f4c52`](https://github.com/janreges/siteone-crawler/commit/12f4c52b7fe0429926c2a6540e8842eae4882888) - http client: added crawler signature to User-Agent and X-Crawler-Info header + added possibility to set Origin request header (otherwise some servers block downloading the fonts) 
[`ae4eaf3`](https://github.com/janreges/siteone-crawler/commit/ae4eaf3298e0bc94c1d913d08393426e380ba4ad) - visited url: added isStaticFile() [`f1cd5e8`](https://github.com/janreges/siteone-crawler/commit/f1cd5e8e397b734dc3353db943c2928ff46cf520) - crawler: increased pcre.backtrack_limit and pcre.recursion_limit (100x) to support longer HTML/CSS/JS [`35a6e9a`](https://github.com/janreges/siteone-crawler/commit/35a6e9a4729fffa7ee0a77b0be50621c4077a7b9) - core options: renamed --headers-to-table to --extra-columns [`7c30988`](https://github.com/janreges/siteone-crawler/commit/7c30988fdecdaeb6aa89aed15a864a033c121d2f) - crawler: added type for audio and xml + static cache for getContentTypeIdByContentTypeHeader [`386599e`](https://github.com/janreges/siteone-crawler/commit/386599e881051ae8c14b7ec9688690e50c0dd7dc) - found urls: normalization of URL takes care of spaces + change of source type to int [`c3063a2`](https://github.com/janreges/siteone-crawler/commit/c3063a247f10bf00b8516eb2303bb85cab426c15) - debugging: possibility to enable debugging through ParsedUrl [`979dc0e`](https://github.com/janreges/siteone-crawler/commit/979dc0e89af063b5ffe04b49275ceb0fa9191db2) - offline url converter: class for solving the translation of URL addresses to offline/local + tests [`44118e6`](https://github.com/janreges/siteone-crawler/commit/44118e6bf96f6b25c7d8410084f76dfb3eb10188) - url converter: TargetDomainRelation enum with tests [`fd6cf21`](https://github.com/janreges/siteone-crawler/commit/fd6cf216d903785adf46923ed2a805937f724d15) - initiator: check only script basename in unknown args check [`888448f`](https://github.com/janreges/siteone-crawler/commit/888448fc9c598a7e8f750e746214b2834722b412) - offline website export: to run the exporter is necessary to set --offline-export-directory [`33e9f95`](https://github.com/janreges/siteone-crawler/commit/33e9f952814b52bdfc7634cf4b9521d393b87417) - offline website export: to run the exporter is necessary to set 
--offline-export-directory [`bcc007b`](https://github.com/janreges/siteone-crawler/commit/bcc007b6a3a9c0e9de23e76bd6f9150c7d2295c9) - log & tmp: added .gitkeep for versioning of these folders - they are used by some optional features [`065f8ef`](https://github.com/janreges/siteone-crawler/commit/065f8ef27fabe889e8a35b98fd75ce260263d268) - offline website export & tests: added the already well-functioning option to export the entire website to offline mode working from local static HTML files, including images, fonts, styles, scripts and other files (no documentation yet) + lot of related changes in Crawler + added first test testing some important functionalities about relative URL building [`4633211`](https://github.com/janreges/siteone-crawler/commit/463321199e6f9bac10b097e3f286da6a13f36906) - composer & phpunit: added composer, phpunit and license CC BY 4.0 [`4979143`](https://github.com/janreges/siteone-crawler/commit/4979143ac2aea9d7b3fe9fcfb9d57f1890c1f114) - visited-url: added info if is external and if is allowed to crawl it [`268a696`](https://github.com/janreges/siteone-crawler/commit/268a6960f8ff69046c8e6c73beae98d24b73ba1f) - text-output: added peak memory usage and average traffic bandwidth to total stats [`cb68340`](https://github.com/janreges/siteone-crawler/commit/cb683407e2cdcd62f5484da96baf9ef43e49a4b3) - crawler: added video support and fixed javascript detection by content-type [`3c3eb96`](https://github.com/janreges/siteone-crawler/commit/3c3eb9625f20657e971249c14cdff97a0a0b8687) - url parsers: extraction of url parsing from html/css into dedicated classes and FoundUrl with info about source tag/attribute [`d87597d`](https://github.com/janreges/siteone-crawler/commit/d87597d36507c7bd6029f87bf1801586eea9b420) - manager: ensure that done callback is executed only once [`d99cccd`](https://github.com/janreges/siteone-crawler/commit/d99cccd91b43680e0726f9c037fb568a9e8be1b4) - http-client: extraction of http client functionality into dedicated 
classes and implemented cache for HTTP responses (critical for efficient development) [`8439e37`](https://github.com/janreges/siteone-crawler/commit/8439e376c50a346e133a2d99e7406020bb89030a) - debugging: added debugging related expert options + Debugger class [`2c89682`](https://github.com/janreges/siteone-crawler/commit/2c89682feaf65a4f224da8ebaf05c48aa899eccc) - parsed-url: added query, it is already needed [`860df08`](https://github.com/janreges/siteone-crawler/commit/860df086ae8c8556420d92e249b3b459b8bf288f) - status: trim only HTML bodies because trim breaks some types of binary files, e.g. avif [`fca2156`](https://github.com/janreges/siteone-crawler/commit/fca2156a2f9607f705a32833a650ae70d5690772) - url parsers: unification of extension length in relevant regexes to {1,10} [`96a3548`](https://github.com/janreges/siteone-crawler/commit/96a35484ba5ab0eee7e43837c1eade1aba6f8a57) - basic-stats: fixed division by zero and nullable times [`8c38b96`](https://github.com/janreges/siteone-crawler/commit/8c38b9660752f132c09e3ceaab596e54176b46e9) - fastest-analyzer: show only URLs with status 200 on the TOP list [`0085dd1`](https://github.com/janreges/siteone-crawler/commit/0085dd1fcbd3b5657eca73345921fe3fc6f407bc) - content-type-analyzer: added stats for 42x statuses (429 Too many requests) [`4f49d12`](https://github.com/janreges/siteone-crawler/commit/4f49d124d1d9993abe3babd9a181c9768b5c2903) - file export: fixed HTML report error after last refactoring [`e77fa6c`](https://github.com/janreges/siteone-crawler/commit/e77fa6cf791da08b522e2124545c303ab5de67ed) - sitemap: publish only URLs with status 200 OK [`b2d4448`](https://github.com/janreges/siteone-crawler/commit/b2d44488a28aeca3421c36ca1e5ada0030de26d8) - summary: added missing </ul> and renamed heading Stats to Summary in HTML report [`c645e16`](https://github.com/janreges/siteone-crawler/commit/c645e16016611a49f70c3d5de9e6ab4d58a45048) - status summary: added summary showing important analyzed metrics with 
OK/WARNING/CRITICAL icons, ordering by severity and INFO about the export execution + interrupting the script by CTRL+C will also run all analyzers, exporters and display all statistics for already processed URLs [`fd643d0`](https://github.com/janreges/siteone-crawler/commit/fd643d016036f4eed5418375f8b25cfe08549ed0) - output consistency: ensuring color and formatting consistency of different types of values (status codes, request durations) [`3ffe1d2`](https://github.com/janreges/siteone-crawler/commit/3ffe1d2a939d718a6fae9c1f927646cfbec808f4) - analyzers: added content-type analyzer with stats for total/avg times, total sizes and statuses 200x, 300x, 400x, 500x [`0475347`](https://github.com/janreges/siteone-crawler/commit/04753478bce1f81dfdab73cd19b0541e725317fe) - crawler: better content-type handling for statistics and added 'Type' column to URL lists + refactored info from array to class [`346caf4`](https://github.com/janreges/siteone-crawler/commit/346caf45f3a18e75a0cf4d0e65961fbee63c9632) - supertable: is now able to display from the array-of-arrays as well as from the array-of-objects + it can translate color declarations from bash to HTML colors when rendering to HTML [`80f0b1c`](https://github.com/janreges/siteone-crawler/commit/80f0b1ca3d50ee7dfae9a01eccbe15fcc06a72d5) - analyzers: TOP slowest/fastest pages analyzer now evaluates only HTML pages, otherwise static content skews the results + decreased minTime for slowest analysis from 0.1 to 0.01 sec (on a very fast and cached website, the results were empty, which is not ideal) [`1390bbc`](https://github.com/janreges/siteone-crawler/commit/1390bbc6daa5484fed8612731dc99f734c406042) - major refactoring: implementation of the Status class summarizing useful information for analyzers/exporters (replaces the JsonOutput over-use) + implementation of basic analyzers (404, redirects, slow/fast URLs) + SuperTable component that exports data to text and HTML + choice of memory-limit setting + change of some 
default values [`efb9a60`](https://github.com/janreges/siteone-crawler/commit/efb9a60aa0be5cb8af55b09723a236370fccb904) - url parsing: fixes for cases when query params are used with htm/html/php/asp etc. + mini readme fix [`af1acfa`](https://github.com/janreges/siteone-crawler/commit/af1acfa9efa536d2ef2e51b2f0a2404ef9d2417a) - minor refactoring: renaming about core options, small non-functional changes [`1dd258e`](https://github.com/janreges/siteone-crawler/commit/1dd258e81eb4d06658e5e41e62141d5be48ce622) - major refactoring: better modularity and auto loading in the area of the exporters, analyzers, their configurability and help auto-building + new mailer options --mail-from-name and --mail-subject-template [`0c57dbd`](https://github.com/janreges/siteone-crawler/commit/0c57dbdb30702cc6669a703788b530fbc4d04af6) - json output: automatic shortening of the URL according to the text width of the console, because if the long URL exceeds the width of the window, the rewriting of the line with the progressbar stops working properly [`106332b`](https://github.com/janreges/siteone-crawler/commit/106332b1d8421dbea5f8725536fa3efed6834564) - manual exit: captures CTRL+C and ends with the statistics for at least the current URLs [`7f4fc80`](https://github.com/janreges/siteone-crawler/commit/7f4fc80c5f9f0fe47da2d9bee2e139489c36a966) - error handling: show red error with help when queue or visited tables are full and info how to fix it [`4efbd73`](https://github.com/janreges/siteone-crawler/commit/4efbd734d775aaa2e6dd66d2d8ed7a007871a1dd) - DOM elements: implemented DOM elements counter and when you add 'DOM' to --headers-to-column you will see DOM elements count [`1837a9c`](https://github.com/janreges/siteone-crawler/commit/1837a9cb12f97a33aec6bcf03a54250bd48545a2) - sitemap and no-color: implemented xml/txt sitemap generator and --no-color option [`f9ade44`](https://github.com/janreges/siteone-crawler/commit/f9ade44d470d97bcc399039bc91a5ce74a6537c1) - readme: added table of 
contents and rewrote intro, features and installation chapters [`469fd1c`](https://github.com/janreges/siteone-crawler/commit/469fd1cf15af4d191c239b2523e0fd8614f7653f) - readme: removed deprecated and duplicate mailer docs [`c5effe8`](https://github.com/janreges/siteone-crawler/commit/c5effe84aece85f7a6aaa97228cd84a5eade4f8b) - readme and CLI help: dividing the parameters into clear groups and improving parameters description - in README.md is detailed form, in CLI instructions is a shorter version. [`19ff724`](https://github.com/janreges/siteone-crawler/commit/19ff724ec0d21f08c4d6cf09def06ba27b023598) - include/ignore regex: added option to limit crawled URLs with the common combination of --include-regex and --ignore-regex [`88e393d`](https://github.com/janreges/siteone-crawler/commit/88e393d33c07fab77173432fd0faf7fe631c2c2c) - html report: masking passwords, styling, added logo, better info ordering and other small changes [`4cdcdab`](https://github.com/janreges/siteone-crawler/commit/4cdcdabf145ffe6f02d84b3250b2a1fc46a5677a) - mailer & exports: implemented ability to send HTML report to e-mail via SMTP + exports to HTML/JSON/TXT file + better reporting of HTTP error conditions (timeout, etc.) 
+ requests for assets are sent only as HEAD without the need to download all binary data + updated documentation [`a97c29d`](https://github.com/janreges/siteone-crawler/commit/a97c29d78f07b4d854853c474fb9d0542b6f2796) - table output: option to set expected column length for better look by 'X-Cache(10)' [`e44f89d`](https://github.com/janreges/siteone-crawler/commit/e44f89d6c3114ccf02c70f38d5ffa5a0f081c1b2) - output: renamed print*() methods to more meaningful add*() relevant also for JSON output [`1069c4a`](https://github.com/janreges/siteone-crawler/commit/1069c4a346d13878c52a316b5953ffa997ec3700) - options: default timeout decreased from 10 to 3, --table-url-column-size renamed to --url-column-size and decreased its default value from 100 to 80, new option --hide-progress-bar, changed --truncate-url-to-column-size to --do-not-truncate-url [`e75038c`](https://github.com/janreges/siteone-crawler/commit/e75038c56afcf85ae591b1dbedf33a54fcd84754) - readme: improved documentation describing use on Windows, macOS or arm64 Linux [`baf2d05`](https://github.com/janreges/siteone-crawler/commit/baf2d0596a3e8367d51fe6ab75793d803e984330) - readme: added info about really tested crawler on Windows with Cygwin (Cygwin has some output limitations and it is not possible to achieve such nice behavior as on Linux) [`1f195c0`](https://github.com/janreges/siteone-crawler/commit/1f195c0c9c8565a37fcb5786070e69c6aa0b8e0e) - windows compatibility: ensuring compatibility with running through cygwin Swoole, which I recommend in the documentation for Windows users [`c22cc45`](https://github.com/janreges/siteone-crawler/commit/c22cc4559ed3de2ac5e4e6e2957b4d3233b4fda5) - json output: implemented nice continuous progress reporting, intentionally on STDERR so the output on STDOUT can be used to save JSON to file + improved README.md [`c095249`](https://github.com/janreges/siteone-crawler/commit/c095249d03c96a00da75553b10dadf7e025a5b0b) - limits: increased limit of max queue length from 1000 to 2000 
(this default will be more suitable even for medium-sized websites) [`c8c3312`](https://github.com/janreges/siteone-crawler/commit/c8c33121c371cc4d0f0791a250178254d9e3a88a) - major refactoring: splitting the code into classes, improving error handling and implementing other functions (JSON output, assets crawling) [`f6902fc`](https://github.com/janreges/siteone-crawler/commit/f6902fc025943ef96150739ae6834358097b235d) - readme: added information how to use crawler with Windows, macOS or arm64 architecture + a few other details [`721f4bb`](https://github.com/janreges/siteone-crawler/commit/721f4bb73e92f65ca3aab789219f046dea665931) - url parsing: handled situations when relative or dotted URLs are also used in HTML, e.g. href='sub/page', href='./sub/page' or href='../sub/page', href='../../sub/page' etc. + few minor optimizations [`c2bbf72`](https://github.com/janreges/siteone-crawler/commit/c2bbf72cf636340a43ebf8472c38008d0fc50f27) - memory allocation: added optional params --max-queue-length=<n> (default 1000), --max-visited-urls=<n> (default 5000) and --max-url-length=<u> (default 2000) [`947a43f`](https://github.com/janreges/siteone-crawler/commit/947a43f3bb826ad852ca51390ae2778fbff320e0) - Initial commit with first version 2023.10.1 [`7109788`](https://github.com/janreges/siteone-crawler/commit/71097884df3c1ade6fd7c02b4ac9ac8f5f161a12) ================================================ FILE: CLAUDE.md ================================================ # CLAUDE.md This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
## Setup After Clone ```bash git config core.hooksPath .githooks # enable pre-commit hook (fmt + clippy + tests) ``` ## Build & Test Commands ```bash cargo fmt # auto-format code (always run before build) cargo build # debug build cargo build --release # release build (~11s) cargo test # unit tests + offline integration tests (~300 tests) cargo test --test integration_crawl -- --ignored --test-threads=1 # network integration tests (crawls crawler.siteone.io) cargo test scoring::ci_gate::tests::all_checks_pass # run a single test by name cargo clippy -- -D warnings # lint (CI enforces zero warnings) cargo fmt -- --check # format check ``` ## Quick Run ```bash ./target/release/siteone-crawler --url=https://example.com --single-page ./target/release/siteone-crawler --url=https://example.com --output=json --http-cache-dir= # no cache ./target/release/siteone-crawler --html-to-markdown=page.html # convert local HTML to markdown (stdout) ./target/release/siteone-crawler --html-to-markdown=page.html --html-to-markdown-output=page.md # convert to file ``` ## Architecture ### Crawl Lifecycle (in order) 1. **CLI Parsing** (`Initiator` → `CoreOptions::parse_argv()`): Parses 120+ CLI options, merges config file if present, validates. Exits with code 101 on error, code 2 on `--help`/`--version`. Non-crawl utility modes (`--serve-markdown`, `--serve-offline`, `--html-to-markdown`) exit early in `main.rs` before creating the Manager. 2. **Analyzer Registration** (`Initiator::register_analyzers()`): Creates all 15 analyzer instances (Accessibility, BestPractice, Caching, ContentType, DNS, ExternalLinks, Fastest, Headers, Page404, Redirects, Security, SeoAndOpenGraph, SkippedUrls, Slowest, SourceDomains, SslTls) and registers them with `AnalysisManager`. Some analyzers receive config from CLI options (e.g. `fastest_top_limit`, `max_heading_level`). 3. 
**Manager Setup** (`Manager::run()`): Creates `Status` (result storage), `Output` (text/json/multi), `HttpClient` (with optional proxy, auth, cache), `ContentProcessorManager` (HTML, CSS, JS, XML, Astro, Next.js, Svelte processors), and the `Crawler` instance. 4. **Robots.txt Fetch** (`Crawler::fetch_robots_txt()`): Before crawling starts, fetches and parses `/robots.txt` from the initial domain. Respects `--ignore-robots-txt` option. 5. **Crawl Loop** (`Crawler::run()`): Breadth-first concurrent URL processing: - URL queue (`DashMap`) seeded with initial URL - Tokio tasks limited by `Semaphore` (= `--workers` count) + rate limiting (`--max-reqs-per-sec`) - Per-URL flow: check robots.txt → HTTP request → on error, store with negative status code → on success, run content processors → extract links from HTML → enqueue discovered URLs - Content processors (`HtmlProcessor`, `CssProcessor`, etc.) transform response bodies during crawl — used by offline/markdown exporters for URL rewriting - Each visited URL's response is stored in `Status` for post-crawl analysis - Per-URL data collected: status code, headers, body, response time, content type, size, redirects 6. **Post-Crawl Analysis** (`Manager::run_post_crawl()`): Sequential pipeline after crawling ends: - Transfer skipped URLs from crawler to `Status` - Run all registered analyzers (`AnalysisManager::run_analyzers()`): each analyzer gets read access to `Status` (all crawled data) and write access to `Output` (adds tables/findings) - Add content processor stats table 7. 
**Exporters** (`Manager::run_exporters()`): Generate output files based on CLI options: - `SitemapExporter`: XML/TXT sitemap files - `OfflineWebsiteExporter`: Static website copy with rewritten relative URLs - `MarkdownExporter`: HTML→Markdown conversion with relative .md links - `FileExporter`: Save text/JSON output to file - `HtmlReport`: Self-contained HTML report (also used by Mailer and Upload) - `MailerExporter`: Email HTML report via SMTP - `UploadExporter`: Upload report to remote server 8. **Scoring** (`scorer::calculate_scores()`): Computes quality scores (0–10) across 5 weighted categories (Performance 20%, SEO 20%, Security 25%, Accessibility 20%, Best Practices 15%). Deductions come from summary findings (criticals, warnings) and stats (404s, 5xx, slow responses). 9. **CI/CD Gate** (`ci_gate::evaluate()`): When `--ci` is active, checks scores and stats against configurable thresholds (`--ci-min-score`, `--ci-max-404`, etc.). Returns exit code 10 on failure. 10. **Summary & Output** (`Output::add_summary()`, `Output::end()`): Prints summary table with OK/Warning/Critical counts, finalizes output. Exit code: 0 = success, 3 = no pages crawled, 10 = CI gate failed. ### How Analyzers Work Each analyzer implements the `Analyzer` trait (`analysis/analyzer.rs`). Analyzers are **post-crawl only** — they don't run during crawling. The `AnalysisManager` calls each analyzer's `analyze(&Status, &mut Output)` method after all URLs have been visited. Analyzers read crawled data from `Status` (visited URLs, response headers, bodies, skipped URLs) and produce `SuperTable` instances that get added to `Output`. Analyzers also add `Item` entries to the `Summary` (OK, Warning, Critical, Info findings) which feed into scoring. ### How Content Processors Work Content processors implement `ContentProcessor` (`content_processor/content_processor.rs`) and run **during crawl** on each URL's response body. 
They serve two purposes: (1) transform content for offline/markdown export (rewrite URLs to relative paths), and (2) extract metadata (links, assets). Processors are type-specific: `HtmlProcessor` handles HTML, `CssProcessor` handles CSS `url()` references, etc. The `ContentProcessorManager` dispatches to the right processor based on content type. ### Concurrency Model The crawler uses tokio for async I/O with a semaphore-based worker pool (`options.workers`). Shared state uses: - `Arc<DashMap<…>>` for lock-free concurrent maps (URL queue, visited URLs, skipped URLs) - `Arc<Mutex<…>>` for sequential-access state (Status, Output, AnalysisManager) - `Arc<AtomicBool>` / `Arc<AtomicUsize>` for simple flags and counters ### Key Traits - **`Analyzer`** (`analysis/analyzer.rs`): Post-crawl analysis (SEO, security, headers, etc.). Each analyzer gets `&Status` and `&mut Output`. - **`Exporter`** (`export/exporter.rs`): Output generators (HTML report, offline website, markdown, sitemap, mailer, upload). - **`Output`** (`output/output.rs`): Formatting backend. Implementations: `TextOutput`, `JsonOutput`, `MultiOutput`. - **`ContentProcessor`** (`content_processor/content_processor.rs`): Per-URL content transformation during crawl (HTML, JS, CSS, XML processors). ### Options System CLI options are defined in `options/core_options.rs` via `get_options()` which returns an `Options` struct with typed option groups. Parsing flow: `parse_argv()` → merge config file → parse flags → `CoreOptions::from_options()` → `apply_option_value()` for each option. New CLI options require: adding the field to `CoreOptions`, a case in `apply_option_value()`, and an entry in the appropriate option group. ### Exit Codes | Code | Meaning | |------|---------| | 0 | Success (with `--ci`: all thresholds passed) | | 1 | Runtime error | | 2 | Help/version displayed | | 3 | No pages successfully crawled (DNS failure, timeout, etc.) 
| | 10 | CI/CD quality gate failed | | 101 | Configuration error | ### HTTP Response Body `HttpResponse.body` is `Option<Vec<u8>>` (not String) to preserve binary data for images, fonts, etc. Use `body_text()` for string content. Failed HTTP requests return `Ok(HttpResponse)` with negative status codes (-1 connection error, -2 timeout, -4 send error), not `Err`. ### Testing Structure - **Unit tests**: In-file `#[cfg(test)] mod tests` blocks (standard Rust convention) - **Integration tests**: `tests/integration_crawl.rs` with shared helpers in `tests/common/mod.rs` - Network-dependent integration tests are `#[ignore]` — run explicitly with `--ignored` ### Testing Complex Scenarios with Sample Websites The crawler has a built-in HTTP server (`--serve-offline=<dir>`) that can serve any local directory as a static website. This enables efficient local testing of edge cases without deploying a real site: 1. Create a sample website directory, e.g. `./tmp/sample-website-xyz/` 2. Add HTML files and assets simulating the desired scenario (spaces in filenames, special characters, redirect chains, broken links, specific heading structures, etc.) 3. Start the built-in server: `./target/release/siteone-crawler --serve-offline=./tmp/sample-website-xyz/ --serve-port=8888` 4. In another terminal, crawl the local site: `./target/release/siteone-crawler --url=http://127.0.0.1:8888/` 5. Verify the crawler handles the scenario correctly (output, offline export, analysis results) This approach is useful for reproducing bug reports, testing regex edge cases (e.g. URLs with spaces, HTML entities, unusual attribute quoting), validating offline/markdown export for specific HTML structures, and any scenario that would be hard to find on a live website. 
### Key Files - `src/engine/crawler.rs` (~1700 lines): Core crawl loop, URL queue management, HTML/content parsing - `src/options/core_options.rs` (~2500 lines): All 120+ CLI options, parsing, validation - `src/export/utils/offline_url_converter.rs` (~1400 lines): URL-to-file-path conversion for offline export - `src/export/html_report/report.rs`: HTML report generation with embedded template - `src/scoring/scorer.rs`: Quality score calculation from summary findings - `src/scoring/ci_gate.rs`: CI/CD threshold evaluation ### Edition & Rust Version Project uses `edition = "2024"` (Rust 1.85+) with `rust-version = "1.94"`. Edition 2024 features used throughout: `unsafe extern` blocks, `if let` chaining (`if let ... && ...`), `unsafe { std::env::set_var() }`. ### Commit Policy **Never commit automatically.** Commits are only allowed on explicit user request. Before every commit, always run `git status`, review the changes, and stage only the relevant files — never use `git add -A` or `git add .` blindly. ### Commit Messages Use [Conventional Commits](https://www.conventionalcommits.org/): `feat:`, `fix:`, `refactor:`, `perf:`, `docs:`, `style:`, `ci:`, `chore:`, `test:`. Examples: - `feat: add built-in HTTP server for markdown/offline exports` - `fix: correct non-ASCII text corruption in heading ID generation` - `perf: eliminate heap allocation in content_type_for_extension` - `chore: bump version to 2.0.3` ### Releasing a New Version 1. Update version in `Cargo.toml` (`version = "X.Y.Z"`) 2. Update version in `src/version.rs` (`pub const CODE: &str = "X.Y.Z.YYYYMMDD";`) 3. Run `cargo check` so that `Cargo.lock` is updated with the new version 4. Commit all three files (`Cargo.toml`, `src/version.rs`, `Cargo.lock`): `git commit -m "chore: bump version to X.Y.Z"` 5. Tag and push: `git tag vX.Y.Z && git push && git push --tags` ### Important Conventions - Tables, column order, and formatting must stay consistent across versions. The HTML parser uses the `scraper` crate. 
- HTTP cache lives in `tmp/http-client-cache/` by default. Delete it for fresh crawls or use `--http-cache-dir=` to disable. - `rustls` requires explicit `ring` CryptoProvider installation in `main.rs`. ================================================ FILE: Cargo.toml ================================================ [package] name = "siteone-crawler" version = "2.3.0" edition = "2024" rust-version = "1.94" authors = ["Ján Regeš "] description = "Website crawler and QA toolkit in Rust for security, performance, SEO, and accessibility audits, offline cloning, markdown export, sitemap generation, cache warming, and CI/CD gating — one dependency-free binary for all major platforms, 10 tools in one." license = "MIT" repository = "https://github.com/janreges/siteone-crawler" homepage = "https://crawler.siteone.io/" keywords = ["crawler", "seo", "website-analysis", "accessibility", "security"] categories = ["command-line-utilities", "web-programming"] readme = "README.md" [[bin]] name = "siteone-crawler" path = "src/main.rs" [dependencies] tokio = { version = "1", features = ["full"] } reqwest = { version = "0.13", features = ["gzip", "brotli", "deflate", "rustls", "socks", "cookies", "stream", "blocking", "multipart"] } scraper = "0.25" regex = "1" clap = { version = "4", features = ["derive"] } serde = { version = "1", features = ["derive"] } serde_json = "1" colored = "3" dashmap = "6" hickory-resolver = "0.25" rustls = { version = "0.23", features = ["ring"] } x509-parser = "0.18" lettre = { version = "0.11", default-features = false, features = ["tokio1-rustls-tls", "smtp-transport", "builder"] } flate2 = "1" brotli = "8" chrono = { version = "0.4", features = ["serde"] } chrono-tz = "0.10" terminal_size = "0.4" quick-xml = "0.39" thiserror = "2" anyhow = "1" md-5 = "0.10" url = "2" percent-encoding = "2" mime = "0.3" once_cell = "1" indexmap = "2" gethostname = "1.1" rustls-native-certs = "0.8" ego-tree = "0.10" base64 = "0.22" dirs = "6" pulldown-cmark = "0.13.1" 
inquire = { version = "0.9", default-features = false, features = ["crossterm"] } crossterm = "0.29" fancy-regex = "0.17" [package.metadata.deb] maintainer = "Ján Regeš " copyright = "2023-2026, Ján Regeš" depends = "libc6" section = "web" priority = "optional" extended-description = """\ SiteOne Crawler is an ultra-fast, open-source website crawler and QA toolkit \ written in Rust. It helps developers, DevOps teams, QA engineers, and technical \ SEO specialists crawl websites, audit quality, stress-test pages under load, \ clone sites for offline browsing and archiving, export content to markdown, \ generate sitemaps, warm caches, and enforce CI/CD quality gates — all from a \ single, dependency-free binary for Linux, macOS, and Windows.\n\ \n\ It combines multiple website tooling workflows in one application: security, \ performance, SEO, accessibility, and best-practices audits; whole-site quality \ scoring; UX checks that other tools miss (e.g. non-clickable phone numbers, \ missing alt text, broken heading hierarchy); reporting of all external links \ with their source pages, redirects, and 404s; stress/load testing with tunable \ concurrency and rate limits; offline multi-domain cloning with URL rewriting; \ markdown export for documentation, archiving, or AI workflows; sitemap \ generation; post-deploy cache warming; and automated quality checks for CI/CD \ pipelines.\n\ \n\ SiteOne Crawler can output results as interactive HTML reports (including an \ image gallery of all pictures found on the site), structured JSON, or readable \ terminal text, making it suitable both for local development and for automation \ in CI/CD environments. 
It can also email HTML reports directly via \ the user's own SMTP server and includes a built-in web server for browsing \ generated markdown exports, plus extensive CLI configurability for advanced \ use cases.\n\ \n\ Whether you need a technical website audit, an offline mirror, a load-testing \ helper, a markdown export for LLM/AI processing, or a reliable quality gate \ before deployment, SiteOne Crawler delivers 10 tools in one — as an ultra-fast, \ portable, open-source Rust binary with zero runtime dependencies.""" assets = [ ["target/release/siteone-crawler", "usr/bin/", "755"], ["README.md", "usr/share/doc/siteone-crawler/", "644"], ["LICENSE", "usr/share/doc/siteone-crawler/", "644"], ] [package.metadata.deb.variants.static] name = "siteone-crawler-static" depends = "" conflicts = "siteone-crawler" provides = "siteone-crawler" extended-description = """\ Statically linked (musl) variant of SiteOne Crawler for maximum Linux compatibility. \ This version runs on any Linux distribution regardless of the installed glibc version. \ Install this if the standard siteone-crawler package reports a 'GLIBC not found' error. 
\ Note: ~50–80% slower than the glibc variant for CPU-intensive operations (offline and \ markdown export) due to the musl memory allocator.""" [package.metadata.generate-rpm] assets = [ { source = "target/release/siteone-crawler", dest = "/usr/bin/siteone-crawler", mode = "0755" }, { source = "README.md", dest = "/usr/share/doc/siteone-crawler/README.md", mode = "0644" }, { source = "LICENSE", dest = "/usr/share/doc/siteone-crawler/LICENSE", mode = "0644" }, ] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023-2026 Ján Regeš Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # SiteOne Crawler SiteOne Crawler is a powerful and easy-to-use **website analyzer, cloner, and converter** designed for developers seeking security and performance insights, SEO specialists identifying optimization opportunities, and website owners needing reliable backups and offline versions. **Now rewritten in Rust** for maximum performance, minimal resource usage, and zero runtime dependencies. The transition from PHP+Swoole to Rust resulted in **25% faster execution** and **30% lower memory consumption** while producing identical output. **Discover the SiteOne Crawler advantage:** * **Run Anywhere:** Single native binary for **🪟 Windows**, **🍎 macOS**, and **🐧 Linux** (x64 & arm64). No runtime dependencies. * **Work Your Way:** Launch the binary without arguments for an **interactive wizard** 🧙 with 10 preset modes, use the extensive **command-line interface** 📟 ([releases](https://github.com/janreges/siteone-crawler/releases), [▶️ video](https://www.youtube.com/watch?v=25T_yx13naA&list=PL9mElgTe-s1Csfg0jXWmDS0MHFN7Cpjwp)) for automation and power, or enjoy the intuitive **desktop GUI application** 💻 ([GUI app](https://github.com/janreges/siteone-crawler-gui), [▶️ video](https://www.youtube.com/watch?v=rFW8LNEVNdw)) for visual control. * **Rich Output Formats:** Interactive **HTML audit report** 📊 with sortable tables and quality scoring (0.0-10.0) (see [nextjs.org sample](https://crawler.siteone.io/html/2024-08-23/forever/cl8xw4r-fdag8wg-44dd.html)), detailed **JSON** for programmatic consumption, and human-readable **text** for terminal. Send HTML reports directly to your inbox via **built-in SMTP mailer** 📧. * **CI/CD Integration:** Built-in **quality gate** (`--ci`) with configurable thresholds — exit code 10 on failure enables automated deployment blocking. 
Also useful for **cache warming** — crawling the entire site after deployment populates your reverse proxy/CDN cache. * **Offline & Markdown Power:** Create complete **offline clones** 💾 for browsing without a server ([nextjs.org clone](https://crawler.siteone.io/examples-exports/nextjs.org/)) or convert entire websites into clean **Markdown** 📝 — perfect for backups, documentation, or feeding content to AI models ([examples](https://github.com/janreges/siteone-crawler-markdown-examples/)). * **Deep Crawling & Analysis:** Thoroughly crawl every page and asset, identify errors (404s, redirects), generate **sitemaps** 🗺️, and even get **email summaries** 📧 (watch [▶️ video example](https://www.youtube.com/watch?v=PHIFSOmk0gk)). * **Learn More:** Dive into the 🌐 [Project Website](https://crawler.siteone.io/), explore the detailed [Documentation](https://crawler.siteone.io/configuration/command-line-options/), or check the [JSON](docs/JSON-OUTPUT.md)/[Text](docs/TEXT-OUTPUT.md) output specs. 
GIF animation of the crawler in action (also available as a [▶️ video](https://www.youtube.com/watch?v=25T_yx13naA&list=PL9mElgTe-s1Csfg0jXWmDS0MHFN7Cpjwp)): ![SiteOne Crawler](docs/siteone-crawler-command-line.gif) ## Table of contents - [✨ Features](#-features) * [🕷️ Crawler](#️-crawler) * [🛠️ Dev/DevOps assistant](#️-devdevops-assistant) * [📊 Analyzer](#-analyzer) * [📧 Reporter](#-reporter) * [💾 Offline website generator](#-offline-website-generator) * [📝 Website to markdown converter](#-website-to-markdown-converter) * [🗺️ Sitemap generator](#️-sitemap-generator) - [🚀 Installation](#-installation) * [📦 Pre-built binaries](#-pre-built-binaries) * [🍺 Homebrew (macOS / Linux)](#-homebrew-macos--linux) * [🐧 Debian / Ubuntu (apt)](#-debian--ubuntu-apt) * [🎩 Fedora / RHEL (dnf)](#-fedora--rhel-dnf) * [🦎 openSUSE / SLES (zypper)](#-opensuse--sles-zypper) * [🏔️ Alpine Linux (apk)](#️-alpine-linux-apk) * [🔨 Build from source](#-build-from-source) - [▶️ Usage](#️-usage) * [Interactive wizard](#interactive-wizard) * [Basic example](#basic-example) * [CI/CD example](#cicd-example) * [Fully-featured example](#fully-featured-example) * [⚙️ Arguments](#️-arguments) + [Basic settings](#basic-settings) + [Output settings](#output-settings) + [Resource filtering](#resource-filtering) + [Advanced crawler settings](#advanced-crawler-settings) + [File export settings](#file-export-settings) + [Mailer options](#mailer-options) + [Upload options](#upload-options) + [Offline exporter options](#offline-exporter-options) + [Markdown exporter options](#markdown-exporter-options) + [Sitemap options](#sitemap-options) + [Expert options](#expert-options) + [Fastest URL analyzer](#fastest-url-analyzer) + [SEO and OpenGraph analyzer](#seo-and-opengraph-analyzer) + [Slowest URL analyzer](#slowest-url-analyzer) + [Built-in HTTP server](#built-in-http-server) + [HTML-to-Markdown conversion](#html-to-markdown-conversion) + [CI/CD settings](#cicd-settings) - [🏆 Quality Scoring](#-quality-scoring) 
- [🔄 CI/CD Integration](#-cicd-integration) - [📄 Output Examples](#-output-examples) - [🧪 Testing](#-testing) - [⚠️ Disclaimer](#️-disclaimer) - [📜 License](#-license) ## ✨ Features In short, the main benefits can be summarized in these points: - **🕷️ Crawler** - very powerful crawler of the entire website reporting useful information about each URL (status code, response time, size, custom headers, titles, etc.) - **🛠️ Dev/DevOps assistant** - offers stress/load testing with configurable concurrent workers (`--workers`) and request rate (`--max-reqs-per-sec`), cache warming, localhost testing, and rich URL/content-type filtering - **📊 Analyzer** - analyzes all webpages and reports strange or error behaviour and useful statistics (404, redirects, bad practices, SEO and security issues, heading structures, etc.) - **📧 Reporter** - interactive **HTML audit report**, structured **JSON**, and colored **text** output; built-in **SMTP mailer** sends HTML reports directly to your inbox - **💾 Offline website generator** - clone entire websites to browsable local HTML files (no server needed) including all assets. Supports **multi-domain clones** — include subdomains or external domains with intelligent cross-linking. - **📝 Website to markdown converter** - export the entire website to browsable text markdown (viewable on GitHub or any text editor), or generate a **single-file markdown** with smart header/footer deduplication — ideal for **feeding to AI tools**. Includes a **built-in web server** that renders markdown exports as styled HTML pages. Also supports **standalone HTML-to-Markdown conversion** of local files (`--html-to-markdown`). See [markdown examples](https://github.com/janreges/siteone-crawler-markdown-examples/). 
- **🗺️ Sitemap generator** - allows you to generate `sitemap.xml` and `sitemap.txt` files with a list of all pages on your website - **🏆 Quality scoring** - automatic quality scoring (0.0-10.0) across 5 categories: Performance, SEO, Security, Accessibility, Best Practices - **🔄 CI/CD quality gate** - configurable thresholds with exit code 10 on failure for automated pipelines; also useful as a **post-deployment cache warmer** for reverse proxies and CDNs The following features are summarized in greater detail: ### 🕷️ Crawler - **all major platforms** supported without dependencies (🐧 Linux, 🪟 Windows, 🍎 macOS, arm64) — single native binary - has incredible **🚀 native Rust performance** with async I/O and multi-threaded crawling - provides simulation of **different device types** (desktop/mobile/tablet) thanks to predefined User-Agents - will crawl **all files**, styles, scripts, fonts, images, documents, etc. on your website - will respect the `robots.txt` file and will not crawl the pages that are not allowed - has a **beautiful interactive** and **🎨 colourful output** - it will **clearly warn you** ⚠️ of any wrong use of the tool (e.g. input parameters validation or wrong permissions) - as `--url` parameter, you can specify also a `sitemap.xml` file (or [sitemap index](https://www.sitemaps.org/protocol.html#index)), which will be processed as a list of URLs. In sitemap-only mode, the crawler follows only URLs from the sitemap — it does not discover additional links from HTML pages. Gzip-compressed sitemaps (`*.xml.gz`) are fully supported, both as direct URLs and when referenced from sitemap index files. - respects the HTML `` tag when resolving relative URLs on pages that use it. ### 🛠️ Dev/DevOps assistant - allows testing **public** and **local projects on specific ports** (e.g. 
`http://localhost:3000/`) - works as a **stress/load tester** — configure the number of **concurrent workers** (`--workers`) and the **maximum requests per second** (`--max-reqs-per-sec`) to simulate various traffic levels and test your infrastructure's resilience against high load or DoS scenarios - combine with **rich filtering options** — include/ignore URLs by regex (`--include-regex`, `--ignore-regex`), disable specific asset types (`--disable-javascript`, `--disable-images`, etc.), or limit crawl depth (`--max-depth`) to focus the load on specific parts of your website - will help you **warm up the application cache** or the **cache on the reverse proxy** of the entire website ### 📊 Analyzer - will **find the weak points** or **strange behavior** of your website - built-in analyzers cover SEO, security headers, accessibility, best practices, performance, SSL/TLS, caching, and more ### 📧 Reporter Three output formats: - **Interactive HTML report** — a self-contained `.html` file with sortable tables, quality scores, color-coded findings, and sections for SEO, security, accessibility, performance, headers, redirects, 404s, and more. Open it in any browser — no server needed. - **JSON output** — structured data with all crawled URLs, response details, analysis findings, scores, and CI/CD gate results. Ideal for programmatic consumption, dashboards, and integrations. - **Text output** — human-readable colored terminal output with tables, progress bars, and summaries. Additional reporting features: - **Built-in SMTP mailer** — send the HTML audit report directly to one or more email addresses via your own SMTP server. Configure sender, recipients, subject template, and SMTP credentials via CLI options. 
- will provide you with data for **SEO analysis**, just add the `Title`, `Keywords` and `Description` extra columns - will provide useful **summaries and statistics** at the end of the processing ### 💾 Offline website generator - will help you **export the entire website** to offline form, where it is possible to browse the site through local HTML files (without HTTP server) including all documents, images, styles, scripts, fonts, etc. - supports **multi-domain clones** — include subdomains (`*.mysite.tld`) or entirely different domains in a single offline export. All URLs across included domains are **intelligently rewritten to relative paths**, so the resulting offline version cross-links pages between domains seamlessly — you get one unified browsable clone. - you can **limit what assets** you want to download and export (see `--disable-*` directives) .. for some types of websites the best result is with the `--disable-javascript` option. - you can specify by `--allowed-domain-for-external-files` (short `-adf`) from which **external domains** it is possible to **download** assets (JS, CSS, fonts, images, documents) including `*` option for all domains. - you can specify by `--allowed-domain-for-crawling` (short `-adc`) which **other domains** should be included in the **crawling** if there are any links pointing to them. You can enable e.g. `mysite.*` to export all language mutations that have a different TLD or `*.mysite.tld` to export all subdomains. - you can use `--single-page` to **export only one page** to which the URL is given (and its assets), but do not follow other pages. - you can use `--single-foreign-page` to **export only one page** from another domain (if allowed by `--allowed-domain-for-crawling`), but do not follow other pages. - you can use `--replace-content` to **replace content** in HTML/JS/CSS with `foo -> bar` or regexp in PCRE format, e.g. `/card[0-9]/i -> card`. Can be specified multiple times. 
- you can use `--replace-query-string` to **replace chars in query string** in the filename. - you can use `--max-depth` to set the **maximum crawling depth** (for pages, not assets). `1` means `/about` or `/about/`, `2` means `/about/contacts` etc. - you can use it to **export your website to a static form** and host it on GitHub Pages, Netlify, Vercel, etc. as a static backup and part of your **disaster recovery plan** or **archival/legal needs** - works great with **older conventional websites** but also **modern ones**, built on frameworks like Next.js, Nuxt.js, SvelteKit, Astro, Gatsby, etc. When a JS framework is detected, the export also performs some framework-specific code modifications for optimal results. - **try it** for your website, and you will be very pleasantly surprised :-) ### 📝 Website to markdown converter Two export modes: - **Multi-file markdown** — exports the entire website with all subpages to a directory of **browsable `.md` files**. The markdown renders nicely when uploaded to GitHub, viewed in VS Code, or any text editor. Links between pages are converted to relative `.md` links so you can navigate between files. Optionally includes images and other files (PDF, etc.). - **Single-file markdown** — combines all pages into **one large markdown file** with smart removal of duplicate website headers and footers across pages. Ideal for **feeding entire website content to AI tools** (ChatGPT, Claude, etc.) that process markdown more effectively than raw HTML. Smart conversion features: - **collapsible accordions** — large link lists (menus, navigation, footer links with 8+ items) are automatically collapsed into `<details>` accordions with contextual labels ("Menu", "Links") for better readability - content before the main heading (typically h1) — such as the site header and navigation — is moved to the end of the page below a `---` separator, so the actual page content comes first - you can set multiple selectors (CSS-like) to **remove unwanted elements** from the exported markdown - **code block detection** and **syntax highlighting** for popular programming languages - HTML tables are converted to proper **markdown tables** Built-in web server: - use `--serve-markdown=<dir>` to start a **built-in HTTP server** that renders your markdown export as styled HTML pages with tables, dark/light mode, breadcrumb navigation, and accordion support — perfect for browsing and sharing the export locally or on a network Standalone HTML-to-Markdown conversion: - use `--html-to-markdown=<file>` to convert a **local HTML file** directly to Markdown without crawling any website - outputs clean Markdown to **stdout** (pipe-friendly) or to a file with `--html-to-markdown-output=<file>` - uses the same conversion pipeline as `--markdown-export-dir` — including all cleanup, accordion collapsing, code language detection, and implicit exclusions (cookie banners, `aria-hidden` elements, `role="menu"` dropdowns) - respects `--markdown-disable-images`, `--markdown-disable-files`, `--markdown-exclude-selector`, and `--markdown-move-content-before-h1-to-end` - does **not** rewrite links (`.html` → `.md`) since the file is standalone with no site context 💡 Tip: you can push the exported markdown folder to your GitHub repository, where it will be automatically rendered as browsable documentation. You can look at the [examples](https://github.com/janreges/siteone-crawler-markdown-examples/) of converted websites to markdown. See all available [markdown exporter options](#markdown-exporter-options) and [HTML-to-Markdown conversion options](#html-to-markdown-conversion). 
### 🗺️ Sitemap generator - will help you create a `sitemap.xml` and `sitemap.txt` for your website - you can set the priority of individual pages based on the number of slashes in the URL Don't hesitate and try it. You will love it as we do! ❤️ ## 🚀 Installation ### 📦 Pre-built binaries Download pre-built binaries from [🐙 GitHub releases](https://github.com/janreges/siteone-crawler/releases) for all major platforms (🐧 Linux, 🪟 Windows, 🍎 macOS, x64 & arm64). The binary is self-contained — no runtime dependencies required. ```bash # Linux / macOS — download, extract, run ./siteone-crawler --url=https://my.domain.tld ``` **🐧 Linux binary variants:** For Linux, two binary variants are provided: | Variant | Compatibility | Performance | |---------|--------------|-------------| | **glibc** (primary) | Requires glibc 2.39+ (Ubuntu 24.04+, Debian 13+, Fedora 40+) | Full native performance | | **musl** (compatible) | Any Linux distribution (statically linked, no dependencies) | ~50–80% slower due to musl memory allocator | The **glibc** variant is recommended for current distributions — it offers the best performance. If you are running an older distribution (e.g. Ubuntu 22.04, Debian 12) and encounter a `GLIBC_2.xx not found` error, use the **musl** variant instead. The musl binary is fully statically linked and runs on any Linux system regardless of the installed glibc version. The performance difference is mainly noticeable during CPU-intensive operations like offline and markdown exports. **Note for macOS users**: In case that Mac refuses to start the crawler from your Download folder, move the entire folder with the Crawler **via the terminal** to another location, for example to the homefolder `~`. 
### 🍺 Homebrew (macOS / Linux) ```bash brew install janreges/tap/siteone-crawler siteone-crawler --url=https://my.domain.tld ``` ### 🐧 Debian / Ubuntu (apt) ```bash curl -1sLf 'https://dl.cloudsmith.io/public/janreges/siteone-crawler/setup.deb.sh' | sudo -E bash sudo apt-get install siteone-crawler ``` > **Older distributions (Ubuntu 22.04, Debian 11/12, etc.):** If you get a `GLIBC_X.XX not found` error, install the statically linked variant instead: > ```bash > sudo apt-get install siteone-crawler-static > ``` > See [Linux binary variants](#-pre-built-binaries) for details on the performance difference. ### 🎩 Fedora / RHEL (dnf) ```bash curl -1sLf 'https://dl.cloudsmith.io/public/janreges/siteone-crawler/setup.rpm.sh' | sudo -E bash sudo dnf install siteone-crawler ``` > **Older distributions:** If you get a `GLIBC_X.XX not found` error, use `sudo dnf install siteone-crawler-static` instead. > See [Linux binary variants](#-pre-built-binaries) for details. ### 🦎 openSUSE / SLES (zypper) ```bash curl -1sLf 'https://dl.cloudsmith.io/public/janreges/siteone-crawler/setup.rpm.sh' | sudo -E bash sudo zypper install siteone-crawler ``` > **Older distributions:** If you get a `GLIBC_X.XX not found` error, use `sudo zypper install siteone-crawler-static` instead. > See [Linux binary variants](#-pre-built-binaries) for details. ### 🏔️ Alpine Linux (apk) ```bash curl -1sLf 'https://dl.cloudsmith.io/public/janreges/siteone-crawler/setup.alpine.sh' | sudo -E bash sudo apk add siteone-crawler ``` ### 🔨 Build from source Requires [Rust](https://www.rust-lang.org/tools/install) 1.94 or later (the `rust-version` pinned in `Cargo.toml`). 
```bash git clone https://github.com/janreges/siteone-crawler.git cd siteone-crawler # Build optimized release binary cargo build --release # Run ./target/release/siteone-crawler --url=https://my.domain.tld ``` **Build statically linked (musl) binary:** ```bash # Install musl toolchain (Ubuntu/Debian) sudo apt-get install musl-tools rustup target add x86_64-unknown-linux-musl # Build static binary (no system dependencies) cargo build --release --target x86_64-unknown-linux-musl # Run — works on any Linux distribution ./target/x86_64-unknown-linux-musl/release/siteone-crawler --url=https://my.domain.tld ``` ## ▶️ Usage ### Interactive wizard Run the binary **without any arguments** and an interactive wizard will guide you through the configuration. Choose from 10 preset modes, enter the target URL, fine-tune settings with arrow keys, and the crawler starts immediately — no need to remember CLI flags. ``` ? Choose a crawl mode: ❯ Quick Audit Fast site health overview — crawls all pages and assets SEO Analysis Extract titles, descriptions, keywords, and OpenGraph tags Performance Test Measure response times with cache disabled — find bottlenecks Security Check Check SSL/TLS, security headers, and redirects site-wide Offline Clone Download entire website with all assets for offline browsing Markdown Export Convert pages to Markdown for AI models or documentation Stress Test High-concurrency load test with cache-busting random params Single Page Deep analysis of a single URL — SEO, security, performance Large Site Crawl High-throughput HTML-only crawl for large sites (100k+ pages) Custom Start from defaults and configure every option manually ────────────────────────────────────── Browse offline export Serve a previously exported offline site via HTTP Browse markdown export Serve a previously exported markdown site via HTTP [↑↓ to move, enter to select, type to filter] ``` After selecting a preset and entering the URL, the wizard shows a settings form where you can 
adjust workers, timeout, content types, export options, and more. A configuration summary with the equivalent CLI command is displayed before the crawl starts — copy it for future use without the wizard. If existing offline or markdown exports are detected in `./tmp/`, the wizard also offers to **serve them via the built-in HTTP server** directly from the menu. ### Basic example To run the crawler from the command line, provide the required arguments: ```bash ./siteone-crawler --url=https://mydomain.tld/ --device=mobile ``` ### CI/CD example ```bash # Fail deployment if quality score < 7.0 or any 5xx errors ./siteone-crawler --url=https://mydomain.tld/ --ci --ci-min-score=7.0 --ci-max-5xx=0 echo $? # 0 = pass, 10 = fail ``` ### Fully-featured example ```bash ./siteone-crawler --url=https://mydomain.tld/ \ --output=text \ --workers=2 \ --max-reqs-per-sec=10 \ --memory-limit=2048M \ --resolve='mydomain.tld:443:127.0.0.1' \ --timeout=5 \ --proxy=proxy.mydomain.tld:8080 \ --http-auth=myuser:secretPassword123 \ --user-agent="My User-Agent String" \ --extra-columns="DOM,X-Cache(10),Title(40),Keywords(50),Description(50>),Heading1=xpath://h1/text()(20>),ProductPrice=regexp:/Price:\s*\$?(\d+(?:\.\d{2})?)/i#1(10)" \ --accept-encoding="gzip, deflate" \ --url-column-size=100 \ --max-queue-length=3000 \ --max-visited-urls=10000 \ --max-url-length=5000 \ --max-non200-responses-per-basename=10 \ --include-regex="/^.*\/technologies.*/" \ --include-regex="/^.*\/fashion.*/" \ --ignore-regex="/^.*\/downloads\/.*\.pdf$/i" \ --analyzer-filter-regex="/^.*$/i" \ --remove-query-params \ --keep-query-param=page \ --add-random-query-params \ --transform-url="live-site.com -> local-site.local" \ --transform-url="/cdn\.live-site\.com/ -> local-site.local/cdn" \ --show-scheme-and-host \ --do-not-truncate-url \ --output-html-report=tmp/myreport.html \ --html-report-options="summary,seo-opengraph,visited-urls,security,redirects" \ --output-json-file=/dir/report.json \ 
--output-text-file=/dir/report.txt \ --add-timestamp-to-output-file \ --add-host-to-output-file \ --offline-export-dir=tmp/mydomain.tld \ --replace-content='/]+>/ -> ' \ --ignore-store-file-error \ --sitemap-xml-file=/dir/sitemap.xml \ --sitemap-txt-file=/dir/sitemap.txt \ --sitemap-base-priority=0.5 \ --sitemap-priority-increase=0.1 \ --markdown-export-dir=tmp/mydomain.tld.md \ --markdown-export-single-file=tmp/mydomain.tld.combined.md \ --markdown-move-content-before-h1-to-end \ --markdown-disable-images \ --markdown-disable-files \ --markdown-remove-links-and-images-from-single-file \ --markdown-exclude-selector='.exclude-me' \ --markdown-replace-content='/]+>/ -> ' \ --markdown-replace-query-string='/[a-z]+=[^&]*(&|$)/i -> $1__$2' \ --mail-to=your.name@my-mail.tld \ --mail-to=your.friend.name@my-mail.tld \ --mail-from=crawler@my-mail.tld \ --mail-from-name="SiteOne Crawler" \ --mail-subject-template="Crawler Report for %domain% (%date%)" \ --mail-smtp-host=smtp.my-mail.tld \ --mail-smtp-port=25 \ --mail-smtp-user=smtp.user \ --mail-smtp-pass=secretPassword123 \ --ci --ci-min-score=7.0 --ci-min-security=8.0 ``` ## ⚙️ Arguments For a clearer list, I recommend going to the documentation: 🌐 https://crawler.siteone.io/configuration/command-line-options/ ### Basic settings | Parameter | Description | |-----------|-------------| | `--url=` | Required. HTTP or HTTPS URL address of the website or sitemap xml to be crawled.
Use quotation marks `''` if the URL contains query parameters. | | `--single-page` | Load only one page to which the URL is given (and its assets), but do not follow other pages. | | `--max-depth=` | Maximum crawling depth (for pages, not assets). Default is `0` (no limit). `1` means `/about`
or `/about/`, `2` means `/about/contacts` etc. | | `--device=` | Device type for choosing a predefined User-Agent. Ignored when `--user-agent` is defined.
Supported values: `desktop`, `mobile`, `tablet`. Default is `desktop`. | | `--user-agent=` | Custom User-Agent header. Use quotation marks. If specified, it takes precedence over
the device parameter. If you add `!` at the end, the siteone-crawler/version will not be
added as a signature at the end of the final user-agent. | | `--timeout=` | Request timeout in seconds. Default is `5`. | | `--proxy=` | HTTP proxy to use in `host:port` format. Host can be hostname, IPv4 or IPv6. | | `--http-auth=` | Basic HTTP authentication in `username:password` format. | | `--config-file=` | Load CLI options from a config file. One option per line, `#` comments allowed.
Without this flag, auto-discovers `~/.siteone-crawler.conf` or `/etc/siteone-crawler.conf`.
CLI arguments override config file values. | ### Output settings | Parameter | Description | |-----------|-------------| | `--output=` | Output type. Supported values: `text`, `json`. Default is `text`. | | `--extra-columns=` | Comma delimited list of extra columns added to output table. You can specify HTTP headers
(e.g. `X-Cache`), predefined values (`Title`, `Keywords`, `Description`, `DOM`), or custom
extraction from text files (HTML, JS, CSS, TXT, JSON, XML, etc.) using XPath or regexp.
For custom extraction, use the format `Custom_column_name=method:pattern#group(length)`, where
`method` is `xpath` or `regexp`, `pattern` is the extraction pattern, an optional `#group` specifies the
capturing group (or node index for XPath) to return (defaulting to the entire match or first node), and an
optional `(length)` sets the maximum output length (append `>` to disable truncation).
For example, use `Heading1=xpath://h1/text()(20>)` to extract the text of the first H1 element
from the HTML document, and `ProductPrice=regexp:/Price:\s*\$?(\d+(?:\.\d{2})?)/i#1(10)`
to extract a numeric price (e.g., "29.99") from a string like "Price: $29.99". | | `--url-column-size=` | Basic URL column width. By default, it is calculated from the size of your terminal window. | | `--rows-limit=` | Max. number of rows to display in tables with analysis results.
Default is `200`. | | `--timezone=` | Timezone for datetimes in HTML reports and timestamps in output folders/files, e.g. `Europe/Prague`.
Default is `UTC`. | | `--do-not-truncate-url` | In the text output, long URLs are truncated by default to `--url-column-size` so the table does not
wrap due to long URLs. With this option, you can turn off the truncation. | | `--show-scheme-and-host` | On text output, show scheme and host also for origin domain URLs. | | `--hide-progress-bar` | Hide progress bar visible in text and JSON output for more compact view. | | `--hide-columns=` | Hide specified columns from the progress table. Comma-separated list of column names:
`type`, `time`, `size`, `cache`. Example: `--hide-columns=cache` or `--hide-columns=cache,type`. | | `--no-color` | Disable colored output. | | `--force-color` | Force colored output regardless of support detection. | | `--show-inline-criticals` | Show criticals from the analyzer directly in the URL table. | | `--show-inline-warnings` | Show warnings from the analyzer directly in the URL table. | ### Resource filtering | Parameter | Description | |-----------|-------------| | `--disable-all-assets` | Disables crawling of all assets and files and only crawls pages in href attributes.
Shortcut for calling all other `--disable-*` flags. | | `--disable-javascript` | Disables JavaScript downloading and removes all JavaScript code from HTML,
including `onclick` and other `on*` handlers. | | `--disable-styles` | Disables CSS file downloading and at the same time removes all style definitions
by `").unwrap()); let result = RE_SCRIPT.replace_all(html, " ").to_string(); RE_STYLE.replace_all(&result, " ").to_string() } ================================================ FILE: src/analysis/caching_analyzer.rs ================================================ // SiteOne Crawler - CachingAnalyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::result::visited_url::VisitedUrl; use crate::utils; const SUPER_TABLE_CACHING_PER_CONTENT_TYPE: &str = "caching-per-content-type"; const SUPER_TABLE_CACHING_PER_DOMAIN: &str = "caching-per-domain"; const SUPER_TABLE_CACHING_PER_DOMAIN_AND_CONTENT_TYPE: &str = "caching-per-domain-and-content-type"; pub struct CachingAnalyzer { base: BaseAnalyzer, } impl Default for CachingAnalyzer { fn default() -> Self { Self::new() } } impl CachingAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } fn update_cache_stat(stat: &mut CacheStat, visited_url: &VisitedUrl) { stat.count += 1; if let Some(lifetime) = visited_url.cache_lifetime { stat.count_with_lifetime += 1; stat.total_lifetime += lifetime; stat.avg_lifetime = Some(stat.total_lifetime as f64 / stat.count_with_lifetime as f64); stat.min_lifetime = Some(match stat.min_lifetime { Some(min) => min.min(lifetime), None => lifetime, }); stat.max_lifetime = Some(match stat.max_lifetime { Some(max) => max.max(lifetime), None => lifetime, }); } } fn build_lifetime_columns(first_col_name: &str, first_col_key: &str) -> Vec { let mut columns = vec![SuperTableColumn::new( first_col_key.to_string(), first_col_name.to_string(), if first_col_key == "domain" { 20 } else { 12 }, None, None, false, false, false, true, None, )]; // Add cacheType column only when not the first column if first_col_key != 
"cacheType" { columns.push(SuperTableColumn::new( "cacheType".to_string(), "Cache type".to_string(), 12, None, None, false, false, false, true, None, )); } columns.extend(vec![ SuperTableColumn::new( "count".to_string(), "URLs".to_string(), 5, None, None, false, false, false, true, None, ), SuperTableColumn::new( "avgLifetime".to_string(), "AVG lifetime".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_cache_lifetime(v, 6) } else { "-".to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "minLifetime".to_string(), "MIN lifetime".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_cache_lifetime(v, 6) } else { "-".to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "maxLifetime".to_string(), "MAX lifetime".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_cache_lifetime(v, 6) } else { "-".to_string() } })), None, false, false, false, true, None, ), ]); columns } } impl Analyzer for CachingAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let visited_urls = status.get_visited_urls(); let mut stats_per_content_type: HashMap = HashMap::new(); let mut stats_per_domain: HashMap = HashMap::new(); let mut stats_per_domain_and_ct: HashMap = HashMap::new(); for visited_url in &visited_urls { let content_type_name = visited_url.content_type.name().to_string(); let cache_type_label = visited_url.get_cache_type_label(); let domain_name = visited_url.get_host().unwrap_or_else(|| "unknown".to_string()); // Per domain { let key = format!("{}.{}", domain_name, cache_type_label); let stat = stats_per_domain.entry(key).or_insert_with(|| CacheStatWithDomain { domain: domain_name.clone(), cache_type: cache_type_label.clone(), stat: CacheStat::default(), }); Self::update_cache_stat(&mut 
stat.stat, visited_url); } // Per domain and content type { let key = format!("{}.{}.{}", domain_name, content_type_name, cache_type_label); let stat = stats_per_domain_and_ct .entry(key) .or_insert_with(|| CacheStatWithDomainAndType { domain: domain_name.clone(), content_type: content_type_name.clone(), cache_type: cache_type_label.clone(), stat: CacheStat::default(), }); Self::update_cache_stat(&mut stat.stat, visited_url); } // Per content type (only crawlable domains) if visited_url.is_allowed_for_crawling { let key = format!("{}.{}", content_type_name, cache_type_label); let stat = stats_per_content_type.entry(key).or_insert_with(|| CacheStatWithType { content_type: content_type_name.clone(), cache_type: cache_type_label.clone(), stat: CacheStat::default(), }); Self::update_cache_stat(&mut stat.stat, visited_url); } } // Per content type table if !stats_per_content_type.is_empty() { let data: Vec> = stats_per_content_type.values().map(|s| s.to_row()).collect(); let columns = Self::build_lifetime_columns("Content type", "contentType"); let mut super_table = SuperTable::new( SUPER_TABLE_CACHING_PER_CONTENT_TYPE.to_string(), "HTTP Caching by content type (only from crawlable domains)".to_string(), "No URLs found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), None, None, Some("HTTP cache".to_string()), ); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } // Per domain table { let data: Vec> = stats_per_domain.values().map(|s| s.to_row()).collect(); let columns = Self::build_lifetime_columns("Domain", "domain"); let mut super_table = SuperTable::new( SUPER_TABLE_CACHING_PER_DOMAIN.to_string(), "HTTP Caching by domain".to_string(), "No URLs found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), None, None, None, ); super_table.set_data(data); 
status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } // Per domain and content type table { let data: Vec> = stats_per_domain_and_ct.values().map(|s| s.to_row()).collect(); let mut columns = Self::build_lifetime_columns("Domain", "domain"); columns.insert( 1, SuperTableColumn::new( "contentType".to_string(), "Content type".to_string(), 12, None, None, false, false, false, true, None, ), ); let mut super_table = SuperTable::new( SUPER_TABLE_CACHING_PER_DOMAIN_AND_CONTENT_TYPE.to_string(), "HTTP Caching by domain and content type".to_string(), "No URLs found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), None, None, None, ); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 116 } fn get_name(&self) -> &str { "CachingAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } #[derive(Default)] struct CacheStat { count: usize, count_with_lifetime: usize, total_lifetime: i64, avg_lifetime: Option, min_lifetime: Option, max_lifetime: Option, } struct CacheStatWithType { content_type: String, cache_type: String, stat: CacheStat, } impl CacheStatWithType { fn to_row(&self) -> HashMap { let mut row = HashMap::new(); row.insert("contentType".to_string(), self.content_type.clone()); row.insert("cacheType".to_string(), self.cache_type.clone()); row.insert("count".to_string(), self.stat.count.to_string()); row.insert( "avgLifetime".to_string(), self.stat .avg_lifetime .map(|v| format!("{}", v as i64)) .unwrap_or_default(), ); row.insert( "minLifetime".to_string(), self.stat.min_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row.insert( 
"maxLifetime".to_string(), self.stat.max_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row } } struct CacheStatWithDomain { domain: String, cache_type: String, stat: CacheStat, } impl CacheStatWithDomain { fn to_row(&self) -> HashMap { let mut row = HashMap::new(); row.insert("domain".to_string(), self.domain.clone()); row.insert("cacheType".to_string(), self.cache_type.clone()); row.insert("count".to_string(), self.stat.count.to_string()); row.insert( "avgLifetime".to_string(), self.stat .avg_lifetime .map(|v| format!("{}", v as i64)) .unwrap_or_default(), ); row.insert( "minLifetime".to_string(), self.stat.min_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row.insert( "maxLifetime".to_string(), self.stat.max_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row } } struct CacheStatWithDomainAndType { domain: String, content_type: String, cache_type: String, stat: CacheStat, } impl CacheStatWithDomainAndType { fn to_row(&self) -> HashMap { let mut row = HashMap::new(); row.insert("domain".to_string(), self.domain.clone()); row.insert("contentType".to_string(), self.content_type.clone()); row.insert("cacheType".to_string(), self.cache_type.clone()); row.insert("count".to_string(), self.stat.count.to_string()); row.insert( "avgLifetime".to_string(), self.stat .avg_lifetime .map(|v| format!("{}", v as i64)) .unwrap_or_default(), ); row.insert( "minLifetime".to_string(), self.stat.min_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row.insert( "maxLifetime".to_string(), self.stat.max_lifetime.map(|v| v.to_string()).unwrap_or_default(), ); row } } ================================================ FILE: src/analysis/content_type_analyzer.rs ================================================ // SiteOne Crawler - ContentTypeAnalyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use 
crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::types::ContentTypeId; use crate::utils; const SUPER_TABLE_CONTENT_TYPES: &str = "content-types"; const SUPER_TABLE_CONTENT_MIME_TYPES: &str = "content-types-raw"; pub struct ContentTypeAnalyzer { base: BaseAnalyzer, } impl Default for ContentTypeAnalyzer { fn default() -> Self { Self::new() } } impl ContentTypeAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } fn add_content_type_super_table(&self, status: &Status, output: &mut dyn Output) { let visited_urls = status.get_visited_urls(); let content_type_ids = get_all_content_type_ids(); let mut stats: HashMap = HashMap::new(); for ct_id in &content_type_ids { let key = format!("{:?}", ct_id); stats.insert( key, ContentTypeStat { content_type_id: *ct_id, content_type: ct_id.name().to_string(), count: 0, total_size: 0, total_time: 0.0, status_20x: 0, status_30x: 0, status_40x: 0, status_42x: 0, status_50x: 0, status_other: 0, }, ); } for visited_url in &visited_urls { if visited_url.has_error_status_code() { continue; } let key = format!("{:?}", visited_url.content_type); if let Some(stat) = stats.get_mut(&key) { stat.count += 1; stat.total_size += visited_url.size.unwrap_or(0); stat.total_time += visited_url.request_time; let status_code = visited_url.status_code; if (200..300).contains(&status_code) { stat.status_20x += 1; } else if (300..400).contains(&status_code) { stat.status_30x += 1; } else if (400..420).contains(&status_code) { stat.status_40x += 1; } else if (420..500).contains(&status_code) { stat.status_42x += 1; } else if (500..600).contains(&status_code) { stat.status_50x += 1; } else { stat.status_other += 1; } } } // Remove empty stats and compute avg time let data: Vec> = stats .values() .filter(|s| s.count > 0) .map(|s| { let avg_time = s.total_time / s.count as f64; let mut row = HashMap::new(); row.insert("contentType".to_string(), 
s.content_type.clone()); row.insert("count".to_string(), s.count.to_string()); row.insert("totalSize".to_string(), s.total_size.to_string()); row.insert("totalTime".to_string(), format!("{:.4}", s.total_time)); row.insert("avgTime".to_string(), format!("{:.4}", avg_time)); row.insert("status20x".to_string(), s.status_20x.to_string()); row.insert("status30x".to_string(), s.status_30x.to_string()); row.insert("status40x".to_string(), s.status_40x.to_string()); row.insert("status42x".to_string(), s.status_42x.to_string()); row.insert("status50x".to_string(), s.status_50x.to_string()); row.insert("statusOther".to_string(), s.status_other.to_string()); row }) .collect(); let columns = build_content_type_columns(); let mut super_table = SuperTable::new( SUPER_TABLE_CONTENT_TYPES.to_string(), "Content types".to_string(), "No URLs found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), None, None, None, ); super_table.set_show_only_columns_with_values(true); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } fn add_content_type_raw_super_table(&self, status: &Status, output: &mut dyn Output) { let visited_urls = status.get_visited_urls(); let mut stats: HashMap = HashMap::new(); for visited_url in &visited_urls { if visited_url.has_error_status_code() { continue; } let key = visited_url .content_type_header .clone() .unwrap_or_else(|| "unknown".to_string()); let stat = stats.entry(key.clone()).or_insert_with(|| MimeTypeStat { content_type: key, count: 0, total_size: 0, total_time: 0.0, status_20x: 0, status_30x: 0, status_40x: 0, status_42x: 0, status_50x: 0, status_other: 0, }); stat.count += 1; stat.total_size += visited_url.size.unwrap_or(0); stat.total_time += visited_url.request_time; let status_code = visited_url.status_code; if (200..300).contains(&status_code) { stat.status_20x += 1; } else if 
(300..400).contains(&status_code) { stat.status_30x += 1; } else if (400..420).contains(&status_code) { stat.status_40x += 1; } else if (420..500).contains(&status_code) { stat.status_42x += 1; } else if (500..600).contains(&status_code) { stat.status_50x += 1; } else { stat.status_other += 1; } } let data: Vec> = stats .values() .map(|s| { let avg_time = if s.count > 0 { s.total_time / s.count as f64 } else { 0.0 }; let mut row = HashMap::new(); row.insert("contentType".to_string(), s.content_type.clone()); row.insert("count".to_string(), s.count.to_string()); row.insert("totalSize".to_string(), s.total_size.to_string()); row.insert("totalTime".to_string(), format!("{:.4}", s.total_time)); row.insert("avgTime".to_string(), format!("{:.4}", avg_time)); row.insert("status20x".to_string(), s.status_20x.to_string()); row.insert("status30x".to_string(), s.status_30x.to_string()); row.insert("status40x".to_string(), s.status_40x.to_string()); row.insert("status42x".to_string(), s.status_42x.to_string()); row.insert("status50x".to_string(), s.status_50x.to_string()); row.insert("statusOther".to_string(), s.status_other.to_string()); row }) .collect(); let mut columns = build_content_type_columns(); // Adjust content type column width for MIME types if let Some(col) = columns.first_mut() { col.width = 26; } let mut super_table = SuperTable::new( SUPER_TABLE_CONTENT_MIME_TYPES.to_string(), "Content types (MIME types)".to_string(), "No MIME types found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), None, None, None, ); super_table.set_show_only_columns_with_values(true); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } } impl Analyzer for ContentTypeAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { self.add_content_type_super_table(status, output); 
self.add_content_type_raw_super_table(status, output); } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 210 } fn get_name(&self) -> &str { "ContentTypeAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } struct ContentTypeStat { #[allow(dead_code)] content_type_id: ContentTypeId, content_type: String, count: usize, total_size: i64, total_time: f64, status_20x: usize, status_30x: usize, status_40x: usize, status_42x: usize, status_50x: usize, status_other: usize, } struct MimeTypeStat { content_type: String, count: usize, total_size: i64, total_time: f64, status_20x: usize, status_30x: usize, status_40x: usize, status_42x: usize, status_50x: usize, status_other: usize, } fn build_content_type_columns() -> Vec { vec![ SuperTableColumn::new( "contentType".to_string(), "Content type".to_string(), 12, None, None, false, false, false, true, None, ), SuperTableColumn::new( "count".to_string(), "URLs".to_string(), 5, None, None, false, false, false, true, None, ), SuperTableColumn::new( "totalSize".to_string(), "Total size".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_formatted_size(v, 0) } else { "-".to_string() } } else { "-".to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "totalTime".to_string(), "Total time".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_formatted_duration(v) } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "avgTime".to_string(), "Avg time".to_string(), 8, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_request_time(v, 8) } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "status20x".to_string(), "Status 
20x".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "green", false) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "status30x".to_string(), "Status 30x".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "yellow", true) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "status40x".to_string(), "Status 40x".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "magenta", true) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "status42x".to_string(), "Status 42x".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "magenta", true) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "status50x".to_string(), "Status 50x".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "red", true) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "statusOther".to_string(), "Status ERR".to_string(), 10, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { if v > 0 { utils::get_color_text(&format!("{:<10}", v), "red", true) } else { value.to_string() } } else { value.to_string() } })), None, false, false, false, true, None, ), ] } fn 
get_all_content_type_ids() -> Vec { vec![ ContentTypeId::Html, ContentTypeId::Script, ContentTypeId::Stylesheet, ContentTypeId::Image, ContentTypeId::Video, ContentTypeId::Audio, ContentTypeId::Font, ContentTypeId::Document, ContentTypeId::Json, ContentTypeId::Xml, ContentTypeId::Redirect, ContentTypeId::Other, ] } ================================================ FILE: src/analysis/dns_analyzer.rs ================================================ // SiteOne Crawler - DnsAnalyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::analysis::result::dns_analysis_result::DnsAnalysisResult; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::utils; const SUPER_TABLE_DNS: &str = "dns"; pub struct DnsAnalyzer { base: BaseAnalyzer, } impl Default for DnsAnalyzer { fn default() -> Self { Self::new() } } impl DnsAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } /// Resolve DNS for the given domain using hickory-resolver. fn get_dns_info(&self, domain: &str) -> Result { use hickory_resolver::Resolver; use hickory_resolver::proto::rr::RecordType; let domain_owned = domain.to_string(); // Use block_in_place to allow blocking the current thread while running async DNS lookups tokio::task::block_in_place(|| { let rt = tokio::runtime::Handle::current(); rt.block_on(async { let resolver = Resolver::builder_tokio() .map_err(|e| format!("Failed to create DNS resolver: {}", e))? 
.build(); let mut resolved_domains = vec![domain_owned.clone()]; let mut ipv4_addresses = Vec::new(); let mut ipv6_addresses = Vec::new(); // Resolve CNAME records if let Ok(cname_response) = resolver.lookup(domain_owned.as_str(), RecordType::CNAME).await { for record in cname_response.iter() { let cname_str = record.to_string().trim_end_matches('.').to_string(); if !resolved_domains.contains(&cname_str) { resolved_domains.push(cname_str); } } } // Resolve A records (IPv4) if let Ok(ipv4_response) = resolver.lookup(domain_owned.as_str(), RecordType::A).await { for record in ipv4_response.iter() { let ip_str = record.to_string(); if !ip_str.is_empty() { ipv4_addresses.push(ip_str); } } } // Resolve AAAA records (IPv6) if let Ok(ipv6_response) = resolver.lookup(domain_owned.as_str(), RecordType::AAAA).await { for record in ipv6_response.iter() { let ip_str = record.to_string(); if !ip_str.is_empty() { ipv6_addresses.push(ip_str); } } } if ipv4_addresses.is_empty() && ipv6_addresses.is_empty() { return Err(format!("Unable to resolve DNS records for {}", domain_owned)); } let dns_server_ip = Self::get_system_dns_server().unwrap_or_else(|| "0.0.0.0".to_string()); let dns_server_name = dns_server_ip.clone(); Ok(DnsAnalysisResult::new( dns_server_name, dns_server_ip, resolved_domains, ipv4_addresses, ipv6_addresses, )) }) }) } /// Read the first nameserver entry from /etc/resolv.conf to get the system DNS server IP. 
fn get_system_dns_server() -> Option { let contents = std::fs::read_to_string("/etc/resolv.conf").ok()?; for line in contents.lines() { let trimmed = line.trim(); if trimmed.starts_with("nameserver") && let Some(ip) = trimmed.split_whitespace().nth(1) { return Some(ip.to_string()); } } None } } impl Analyzer for DnsAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let columns = vec![SuperTableColumn::new( "info".to_string(), "DNS resolving tree".to_string(), 70, Some(Box::new(|value: &str, _render_into: &str| { let mut result = value.to_string(); // Colorize IPv4 addresses if let Ok(re) = regex::Regex::new(r"(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})") { result = re .replace_all(&result, |caps: ®ex::Captures| { let ip = &caps[1]; if ip.parse::().is_ok() { utils::get_color_text(ip, "blue", true) } else { ip.to_string() } }) .to_string(); } // Colorize IPv6 addresses if let Ok(re) = regex::Regex::new(r"([0-9a-f:]+:+)+[0-9a-f]+") { result = re .replace_all(&result, |caps: ®ex::Captures| { let ip = &caps[0]; if ip.parse::().is_ok() { utils::get_color_text(ip, "blue", true) } else { ip.to_string() } }) .to_string(); } result })), None, true, false, true, false, None, )]; let mut super_table = SuperTable::new( SUPER_TABLE_DNS.to_string(), "DNS info".to_string(), "No DNS info found.".to_string(), columns, false, None, "ASC".to_string(), None, None, None, ); let mut data: Vec> = Vec::new(); // Extract domain from the first visited URL let domain = status .get_visited_urls() .first() .and_then(|u| u.get_host()) .unwrap_or_else(|| "unknown".to_string()); match self.get_dns_info(&domain) { Ok(dns_info) => { for line in dns_info.get_txt_description().lines() { let mut row = HashMap::new(); row.insert("info".to_string(), line.to_string()); data.push(row); } let resolved_domain = dns_info .resolved_domains .first() .cloned() .unwrap_or_else(|| "unknown".to_string()); // DNS server suffix — omit when unknown (e.g. 
on Windows where /etc/resolv.conf doesn't exist) let dns_suffix = if dns_info.dns_server_ip_address != "0.0.0.0" { format!(" (DNS server: {})", dns_info.dns_server_name) } else { String::new() }; // IPv4 summary if !dns_info.ipv4_addresses.is_empty() { status.add_ok_to_summary( "dns-ipv4", &format!( "DNS IPv4 OK: domain {} resolved to {}{}", resolved_domain, dns_info.ipv4_addresses.join(", "), dns_suffix ), ); } else { status.add_notice_to_summary( "dns-ipv4", &format!( "DNS IPv4: domain {} does not support IPv4{}", resolved_domain, dns_suffix ), ); } // IPv6 summary if !dns_info.ipv6_addresses.is_empty() { status.add_ok_to_summary( "dns-ipv6", &format!( "DNS IPv6 OK: domain {} resolved to {}{}", resolved_domain, dns_info.ipv6_addresses.join(", "), dns_suffix ), ); } else { status.add_notice_to_summary( "dns-ipv6", &format!( "DNS IPv6: domain {} does not support IPv6{}", resolved_domain, dns_suffix ), ); } // CNAME chain summary if dns_info.resolved_domains.len() > 1 { status.add_info_to_summary( "dns-aliases", &format!( "DNS Aliases: IP(s) for domain {} were resolved by CNAME chain {}.", resolved_domain, dns_info.resolved_domains.join(" > ") ), ); } } Err(e) => { let mut row = HashMap::new(); row.insert("info".to_string(), e.clone()); data.push(row); status.add_critical_to_summary("dns", &format!("Problem with DNS analysis: {}", e)); } } super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_end(super_table); } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 215 } fn get_name(&self) -> &str { "DnsAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } ================================================ FILE: src/analysis/external_links_analyzer.rs ================================================ // SiteOne Crawler - ExternalLinksAnalyzer // (c) Jan 
Reges // // Presents external URLs discovered during crawling as a dedicated section. // Groups external URLs, shows occurrence count and up to 5 source pages. use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::types::SkippedReason; const SUPER_TABLE_EXTERNAL_URLS: &str = "external-urls"; const MAX_SOURCE_PAGES: usize = 5; pub struct ExternalLinksAnalyzer { base: BaseAnalyzer, } impl Default for ExternalLinksAnalyzer { fn default() -> Self { Self::new() } } impl ExternalLinksAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } } impl Analyzer for ExternalLinksAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let skipped_entries = status.get_skipped_urls(); // Filter only external links (NotAllowedHost reason) let external_entries: Vec<_> = skipped_entries .iter() .filter(|e| matches!(e.reason, SkippedReason::NotAllowedHost)) .collect(); // Group by external URL: collect count and source page URLs let mut url_data: HashMap> = HashMap::new(); for entry in &external_entries { let source_url = status.get_url_by_uq_id(&entry.source_uq_id).unwrap_or_default(); let sources = url_data.entry(entry.url.clone()).or_default(); if !source_url.is_empty() && !sources.contains(&source_url) { sources.push(source_url); } } let total_urls = url_data.len(); let mut rows: Vec> = url_data .iter() .map(|(ext_url, sources)| { let mut row = HashMap::new(); row.insert("url".to_string(), ext_url.clone()); row.insert("count".to_string(), sources.len().to_string()); let display_sources: Vec<&str> = sources.iter().take(MAX_SOURCE_PAGES).map(|s| s.as_str()).collect(); let mut found_on = display_sources.join(", "); if sources.len() > MAX_SOURCE_PAGES { found_on.push_str(&format!(" (+{})", 
sources.len() - MAX_SOURCE_PAGES)); } row.insert("foundOn".to_string(), found_on); row }) .collect(); rows.sort_by(|a, b| { let count_a: usize = a.get("count").and_then(|c| c.parse().ok()).unwrap_or(0); let count_b: usize = b.get("count").and_then(|c| c.parse().ok()).unwrap_or(0); count_b.cmp(&count_a).then_with(|| a.get("url").cmp(&b.get("url"))) }); let url_column_width = 60; let columns = vec![ SuperTableColumn::new( "url".to_string(), "External URL".to_string(), url_column_width, None, None, true, true, false, true, None, ), SuperTableColumn::new( "count".to_string(), "Pages".to_string(), 5, None, None, false, false, false, true, None, ), SuperTableColumn::new( "foundOn".to_string(), "Found on URL (max 5)".to_string(), url_column_width, None, None, true, true, false, true, None, ), ]; let mut super_table = SuperTable::new( SUPER_TABLE_EXTERNAL_URLS.to_string(), "External URLs".to_string(), "No external URLs found.".to_string(), columns, true, Some("count".to_string()), "DESC".to_string(), Some(format!("{} external URL(s)", total_urls)), None, None, ); super_table.set_data(rows); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); status.add_summary_item_by_ranges( "external-urls", total_urls as f64, &[(0.0, 0.0), (1.0, f64::MAX)], &[ "External URLs - no external URLs found", "External URLs - {} external URL(s) found", ], ); } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 7 // After skipped URLs (6) } fn get_name(&self) -> &str { "ExternalLinksAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } ================================================ FILE: src/analysis/fastest_analyzer.rs ================================================ // SiteOne Crawler - FastestAnalyzer // (c) Jan Reges use std::collections::HashMap; use 
crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::types::ContentTypeId; use crate::utils; const SUPER_TABLE_FASTEST_URLS: &str = "fastest-urls"; pub struct FastestAnalyzer { base: BaseAnalyzer, fastest_top_limit: usize, fastest_max_time: f64, } impl Default for FastestAnalyzer { fn default() -> Self { Self::new() } } impl FastestAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), fastest_top_limit: 20, fastest_max_time: 1.0, } } /// Set configuration from CoreOptions. pub fn set_config(&mut self, fastest_top_limit: usize, fastest_max_time: f64) { self.fastest_top_limit = fastest_top_limit; self.fastest_max_time = fastest_max_time; } } impl Analyzer for FastestAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let visited_urls = status.get_visited_urls(); let mut fast_urls: Vec<_> = visited_urls .into_iter() .filter(|u| { u.status_code == 200 && u.is_allowed_for_crawling && u.content_type == ContentTypeId::Html && u.request_time <= self.fastest_max_time }) .collect(); fast_urls.sort_by(|a, b| { a.request_time .partial_cmp(&b.request_time) .unwrap_or(std::cmp::Ordering::Equal) }); fast_urls.truncate(self.fastest_top_limit); let console_width = utils::get_console_width(); let url_column_width = (console_width as i32 - 20).max(20); let columns = vec![ SuperTableColumn::new( "requestTime".to_string(), "Time".to_string(), 6, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_request_time(v, 6) } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "statusCode".to_string(), "Status".to_string(), 6, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_status_code(v, 
6) } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "url".to_string(), "Fast URL".to_string(), url_column_width, None, None, true, true, false, true, None, ), ]; let data: Vec> = fast_urls .iter() .map(|u| { let mut row = HashMap::new(); row.insert("requestTime".to_string(), format!("{:.4}", u.request_time)); row.insert("statusCode".to_string(), u.status_code.to_string()); row.insert("url".to_string(), u.url.clone()); row }) .collect(); let mut super_table = SuperTable::new( SUPER_TABLE_FASTEST_URLS.to_string(), "TOP fastest URLs".to_string(), format!("No fast URLs faster than {} second(s) found.", self.fastest_max_time), columns, true, Some("requestTime".to_string()), "ASC".to_string(), None, None, None, ); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 100 } fn get_name(&self) -> &str { "FastestAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } ================================================ FILE: src/analysis/headers_analyzer.rs ================================================ // SiteOne Crawler - HeadersAnalyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::analysis::result::header_stats::HeaderStats; use crate::analysis::result::url_analysis_result::UrlAnalysisResult; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::result::visited_url::VisitedUrl; use crate::utils; const SUPER_TABLE_HEADERS: &str = "headers"; const SUPER_TABLE_HEADERS_VALUES: &str = 
"headers-values"; pub struct HeadersAnalyzer { base: BaseAnalyzer, header_stats: HashMap, } impl Default for HeadersAnalyzer { fn default() -> Self { Self::new() } } impl HeadersAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), header_stats: HashMap::new(), } } } impl Analyzer for HeadersAnalyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let console_width = utils::get_console_width(); // Basic header stats table let data: Vec> = self .header_stats .values() .map(|hs| { let mut row = HashMap::new(); row.insert("header".to_string(), hs.get_formatted_header_name()); row.insert("occurrences".to_string(), hs.occurrences.to_string()); let unique_count = hs.unique_values.len(); let unique_str = if unique_count == 0 { "-".to_string() } else if hs.unique_values_limit_reached { format!("{}+", unique_count) } else { unique_count.to_string() }; row.insert("uniqueValues".to_string(), unique_str); row.insert("valuesPreview".to_string(), hs.get_values_preview(120)); let min_value = hs.get_min_value().unwrap_or_default(); let max_value = hs.get_max_value().unwrap_or_default(); // Format min/max for content-length and age if hs.header == "content-length" { if let Some(min_int) = hs.min_int_value { row.insert("minValue".to_string(), utils::get_formatted_size(min_int, 0)); } else { row.insert("minValue".to_string(), String::new()); } if let Some(max_int) = hs.max_int_value { row.insert("maxValue".to_string(), utils::get_formatted_size(max_int, 0)); } else { row.insert("maxValue".to_string(), String::new()); } } else if hs.header == "age" { if let Some(min_int) = hs.min_int_value { row.insert("minValue".to_string(), utils::get_formatted_age(min_int)); } else { row.insert("minValue".to_string(), String::new()); } if let Some(max_int) = hs.max_int_value { row.insert("maxValue".to_string(), utils::get_formatted_age(max_int)); } else { row.insert("maxValue".to_string(), String::new()); } } else { row.insert("minValue".to_string(), min_value); 
row.insert("maxValue".to_string(), max_value); } row }) .collect(); let columns = vec![ SuperTableColumn::new( "header".to_string(), "Header".to_string(), -1, // AUTO_WIDTH None, None, false, false, false, true, None, ), SuperTableColumn::new( "occurrences".to_string(), "Occurs".to_string(), 6, None, None, false, false, false, true, None, ), SuperTableColumn::new( "uniqueValues".to_string(), "Unique".to_string(), 6, None, None, false, false, false, true, None, ), SuperTableColumn::new( "valuesPreview".to_string(), "Values preview".to_string(), (console_width as i32 - 90).max(20), None, None, true, true, false, false, None, ), SuperTableColumn::new( "minValue".to_string(), "Min value".to_string(), 10, None, None, false, false, false, true, None, ), SuperTableColumn::new( "maxValue".to_string(), "Max value".to_string(), 10, None, None, false, false, false, true, None, ), ]; let mut super_table = SuperTable::new( SUPER_TABLE_HEADERS.to_string(), "HTTP headers".to_string(), "No HTTP headers found.".to_string(), columns, true, Some("header".to_string()), "ASC".to_string(), None, None, None, ); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_end(super_table); let unique_count = self.header_stats.len(); status.add_summary_item_by_ranges( "unique-headers", unique_count as f64, &[(0.0, 30.0), (31.0, 40.0), (41.0, 50.0), (51.0, f64::MAX)], &[ "HTTP headers - found {} unique headers", "HTTP headers - found {} unique headers", "HTTP headers - found {} unique headers (too many)", "HTTP headers - found {} unique headers (too many)", ], ); // Detail info with header values let mut details: Vec> = Vec::new(); for header_stat in self.header_stats.values() { for (value, count) in &header_stat.unique_values { let mut row = HashMap::new(); row.insert("header".to_string(), header_stat.get_formatted_header_name()); row.insert("occurrences".to_string(), count.to_string()); 
row.insert("value".to_string(), value.clone()); details.push(row); } } // Sort by header asc, then by occurrences desc details.sort_by(|a, b| { let header_a = a.get("header").cloned().unwrap_or_default(); let header_b = b.get("header").cloned().unwrap_or_default(); if header_a == header_b { let occ_a = a.get("occurrences").and_then(|v| v.parse::().ok()).unwrap_or(0); let occ_b = b.get("occurrences").and_then(|v| v.parse::().ok()).unwrap_or(0); occ_b.cmp(&occ_a) } else { header_a.cmp(&header_b) } }); let detail_columns = vec![ SuperTableColumn::new( "header".to_string(), "Header".to_string(), -1, // AUTO_WIDTH None, None, false, false, false, true, None, ), SuperTableColumn::new( "occurrences".to_string(), "Occurs".to_string(), 6, None, None, false, false, false, true, None, ), SuperTableColumn::new( "value".to_string(), "Value".to_string(), (console_width as i32 - 56).max(20), None, None, true, true, false, true, None, ), ]; let mut detail_table = SuperTable::new( SUPER_TABLE_HEADERS_VALUES.to_string(), "HTTP header values".to_string(), "No HTTP headers found.".to_string(), detail_columns, true, None, "ASC".to_string(), None, None, None, ); detail_table.set_data(details); status.configure_super_table_url_stripping(&mut detail_table); output.add_super_table(&detail_table); status.add_super_table_at_end(detail_table); } fn analyze_visited_url( &mut self, visited_url: &VisitedUrl, _body: Option<&str>, headers: Option<&HashMap>, ) -> Option { let headers = headers?; if !visited_url.is_allowed_for_crawling { return None; } for (header, values) in headers { let header_lower = header.to_lowercase(); let stat = self .header_stats .entry(header_lower.clone()) .or_insert_with(|| HeaderStats::new(header_lower)); stat.add_value(values); } None } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 115 } fn get_name(&self) -> &str { "HeadersAnalyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> 
&HashMap<String, usize> {
        self.base.get_exec_counts()
    }
}


================================================
FILE: src/analysis/manager.rs
================================================
// SiteOne Crawler - Analysis Manager
// (c) Jan Reges

use std::collections::HashMap;

use crate::analysis::analyzer::Analyzer;
use crate::analysis::result::url_analysis_result::UrlAnalysisResult;
use crate::output::output::Output;
use crate::result::manager_stats::ManagerStats;
use crate::result::status::Status;
use crate::result::visited_url::VisitedUrl;
use crate::utils;

pub const SUPER_TABLE_ANALYSIS_STATS: &str = "analysis-stats";

/// Owns all registered analyzers and orchestrates per-URL and post-crawl analysis.
pub struct AnalysisManager {
    analyzers: Vec<Box<dyn Analyzer>>,
    stats: ManagerStats,
}

impl AnalysisManager {
    pub fn new() -> Self {
        Self {
            analyzers: Vec::new(),
            stats: ManagerStats::new(),
        }
    }

    /// Register all analyzer instances. Each analyzer's should_be_activated()
    /// determines whether it is actually used.
    pub fn register_analyzer(&mut self, analyzer: Box<dyn Analyzer>) {
        self.analyzers.push(analyzer);
    }

    /// Auto-activate: remove analyzers that should not be activated based on options.
    pub fn auto_activate_analyzers(&mut self) {
        self.analyzers.retain(|a| a.should_be_activated());
    }

    /// Filter analyzers by regex pattern.
    /// Only analyzers whose name matches the regex are kept.
    /// Supports PCRE-style delimited patterns (e.g., /security/i).
    pub fn filter_analyzers_by_regex(&mut self, filter_regex: &str) {
        let pattern = utils::extract_pcre_regex_pattern(filter_regex);
        if let Ok(re) = fancy_regex::Regex::new(&pattern) {
            // fancy_regex::is_match returns Result — keep the analyzer on evaluation error
            self.analyzers.retain(|a| re.is_match(a.get_name()).unwrap_or(true));
        }
    }

    /// Run analyze_visited_url for each active analyzer.
    /// Called per URL during the crawl.
    pub fn analyze_visited_url(
        &mut self,
        visited_url: &VisitedUrl,
        body: Option<&str>,
        headers: Option<&HashMap<String, String>>,
        status: &Status,
    ) -> Vec<(String, UrlAnalysisResult)> {
        let mut results = Vec::new();
        for analyzer in &mut self.analyzers {
            if let Some(result) = analyzer.analyze_visited_url(visited_url, body, headers) {
                let name = analyzer.get_name().to_string();
                status.add_url_analysis_result(
                    &visited_url.uq_id,
                    crate::result::status::UrlAnalysisResultEntry {
                        analysis_name: name.clone(),
                        result: result.clone(),
                    },
                );
                results.push((name, result));
            }
        }
        results
    }

    /// Run post-crawl analysis for all active analyzers, sorted by order.
    pub fn run_analyzers(&mut self, status: &Status, output: &mut dyn Output) {
        // Check if there are any working URLs
        if status.get_number_of_working_visited_urls() == 0 {
            let error_message = "The analysis has been suspended because no working URL could be found. Please check the URL/domain.";
            output.add_error(error_message);
            status.add_critical_to_summary("analysis-manager-error", error_message);
            return;
        }

        // Sort analyzers by order
        self.analyzers.sort_by_key(|a| a.get_order());

        for analyzer in &mut self.analyzers {
            analyzer.analyze(status, output);
        }

        // Collect and merge exec times from all analyzers
        if !self.analyzers.is_empty() {
            let mut all_exec_times: HashMap<String, f64> = HashMap::new();
            let mut all_exec_counts: HashMap<String, usize> = HashMap::new();
            for analyzer in &self.analyzers {
                for (key, time) in analyzer.get_exec_times() {
                    *all_exec_times.entry(key.clone()).or_insert(0.0) += time;
                }
                for (key, count) in analyzer.get_exec_counts() {
                    *all_exec_counts.entry(key.clone()).or_insert(0) += count;
                }
            }

            let mut super_table = self.stats.get_super_table(
                SUPER_TABLE_ANALYSIS_STATS,
                "Analysis stats",
                "No analysis stats",
                Some(&all_exec_times),
                Some(&all_exec_counts),
            );
            status.configure_super_table_url_stripping(&mut super_table);
            output.add_super_table(&super_table);
            status.add_super_table_at_end(super_table);
        }
    }

    /// Get all analyzers
    pub fn get_analyzers(&self) -> &[Box<dyn Analyzer>] {
        &self.analyzers
    }

    /// Check if analyzer with given name is active
    pub fn has_analyzer(&self, name: &str) -> bool {
        self.analyzers.iter().any(|a| a.get_name() == name)
    }

    /// Get extra columns from all analyzers that want to show results as columns.
    /// Returns columns in registration order (alphabetical).
    // NOTE(review): the Vec element type (return type of
    // show_analyzed_visited_url_result_as_column) was lost in extraction — restore it.
    pub fn get_extra_columns(&self) -> Vec {
        self.analyzers
            .iter()
            .filter_map(|a| a.show_analyzed_visited_url_result_as_column())
            .collect()
    }

    /// Map analysis results to extra column values for the progress table.
    /// Returns a HashMap of column_name -> colorized_value_string.
    pub fn get_analysis_column_values(
        &self,
        analysis_results: &[(String, UrlAnalysisResult)],
    ) -> HashMap<String, String> {
        let mut result = HashMap::new();
        for analyzer in &self.analyzers {
            if let Some(extra_col) = analyzer.show_analyzed_visited_url_result_as_column() {
                let analyzer_name = analyzer.get_name();
                // Find the matching result for this analyzer
                if let Some((_, url_result)) =
                    analysis_results.iter().find(|(name, _)| name == analyzer_name)
                {
                    let colorized = url_result.to_colorized_string(true);
                    if !colorized.is_empty() {
                        result.insert(extra_col.name.clone(), colorized);
                    }
                }
            }
        }
        result
    }
}

impl Default for AnalysisManager {
    fn default() -> Self {
        Self::new()
    }
}


================================================
FILE: src/analysis/mod.rs
================================================
pub mod analyzer;
pub mod base_analyzer;
pub mod manager;
pub mod result;

// Simple analyzers
pub mod caching_analyzer;
pub mod content_type_analyzer;
pub mod dns_analyzer;
pub mod external_links_analyzer;
pub mod fastest_analyzer;
pub mod headers_analyzer;
pub mod page404_analyzer;
pub mod redirects_analyzer;
pub mod skipped_urls_analyzer;
pub mod slowest_analyzer;
pub mod source_domains_analyzer;

// Complex analyzers (DOM parsing / TLS inspection)
pub mod accessibility_analyzer;
pub mod best_practice_analyzer;
pub mod security_analyzer;
pub mod seo_opengraph_analyzer;
pub mod
ssl_tls_analyzer; ================================================ FILE: src/analysis/page404_analyzer.rs ================================================ // SiteOne Crawler - Page404Analyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::utils; const SUPER_TABLE_404: &str = "404"; pub struct Page404Analyzer { base: BaseAnalyzer, } impl Default for Page404Analyzer { fn default() -> Self { Self::new() } } impl Page404Analyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } } impl Analyzer for Page404Analyzer { fn analyze(&mut self, status: &Status, output: &mut dyn Output) { let visited_urls = status.get_visited_urls(); let urls_404: Vec<_> = visited_urls.iter().filter(|u| u.status_code == 404).cloned().collect(); let console_width = utils::get_console_width(); let url_column_size = ((console_width as i32 - 16) / 2).max(20); let status_ref = status; let columns = vec![ SuperTableColumn::new( "statusCode".to_string(), "Status".to_string(), 6, Some(Box::new(|value: &str, _render_into: &str| { if let Ok(v) = value.parse::() { utils::get_colored_status_code(v, 6) } else { value.to_string() } })), None, false, false, false, true, None, ), SuperTableColumn::new( "url".to_string(), "URL 404".to_string(), url_column_size, None, None, true, true, false, true, None, ), SuperTableColumn::new( "sourceUqId".to_string(), "Found at URL".to_string(), url_column_size, None, None, true, true, false, true, None, ), ]; let data: Vec> = urls_404 .iter() .map(|u| { let mut row = HashMap::new(); row.insert("statusCode".to_string(), u.status_code.to_string()); row.insert("url".to_string(), u.url.clone()); let source_url = if !u.source_uq_id.is_empty() { 
status_ref.get_url_by_uq_id(&u.source_uq_id).unwrap_or_default() } else { String::new() }; row.insert("sourceUqId".to_string(), source_url); row }) .collect(); let count_404 = data.len(); let mut super_table = SuperTable::new( SUPER_TABLE_404.to_string(), "404 URLs".to_string(), "No 404 URLs found.".to_string(), columns, true, Some("url".to_string()), "ASC".to_string(), None, None, None, ); super_table.set_data(data); status.configure_super_table_url_stripping(&mut super_table); output.add_super_table(&super_table); status.add_super_table_at_beginning(super_table); status.add_summary_item_by_ranges( "404", count_404 as f64, &[(0.0, 0.0), (1.0, 2.0), (3.0, 5.0), (6.0, f64::MAX)], &[ "404 OK - all pages exists, no non-existent pages found", "404 NOTICE - {} non-existent page(s) found", "404 WARNING - {} non-existent pages found", "404 CRITICAL - {} non-existent pages found", ], ); } fn should_be_activated(&self) -> bool { true } fn get_order(&self) -> i32 { 20 } fn get_name(&self) -> &str { "Page404Analyzer" } fn get_exec_times(&self) -> &HashMap { self.base.get_exec_times() } fn get_exec_counts(&self) -> &HashMap { self.base.get_exec_counts() } } ================================================ FILE: src/analysis/redirects_analyzer.rs ================================================ // SiteOne Crawler - RedirectsAnalyzer // (c) Jan Reges use std::collections::HashMap; use crate::analysis::analyzer::Analyzer; use crate::analysis::base_analyzer::BaseAnalyzer; use crate::components::super_table::SuperTable; use crate::components::super_table_column::SuperTableColumn; use crate::output::output::Output; use crate::result::status::Status; use crate::utils; const SUPER_TABLE_REDIRECTS: &str = "redirects"; pub struct RedirectsAnalyzer { base: BaseAnalyzer, } impl Default for RedirectsAnalyzer { fn default() -> Self { Self::new() } } impl RedirectsAnalyzer { pub fn new() -> Self { Self { base: BaseAnalyzer::new(), } } } impl Analyzer for RedirectsAnalyzer { fn analyze(&mut 
self, status: &Status, output: &mut dyn Output) {
        let visited_urls = status.get_visited_urls();
        // All permanent/temporary redirect status codes
        let url_redirects: Vec<_> = visited_urls
            .iter()
            .filter(|u| (301..=308).contains(&u.status_code))
            .cloned()
            .collect();

        let console_width = utils::get_console_width();
        let url_column_width = ((console_width as i32 - 20) / 3).max(20);

        let columns = vec![
            SuperTableColumn::new(
                "statusCode".to_string(),
                "Status".to_string(),
                6,
                // NOTE(review): parsed integer width lost in extraction — i32 assumed
                Some(Box::new(|value: &str, _render_into: &str| {
                    if let Ok(v) = value.parse::<i32>() {
                        utils::get_colored_status_code(v, 6)
                    } else {
                        value.to_string()
                    }
                })),
                None,
                false,
                false,
                false,
                true,
                None,
            ),
            SuperTableColumn::new(
                "url".to_string(),
                "Redirected URL".to_string(),
                url_column_width,
                None,
                None,
                true,
                true,
                false,
                true,
                None,
            ),
            SuperTableColumn::new(
                "targetUrl".to_string(),
                "Target URL".to_string(),
                url_column_width,
                None,
                None,
                true,
                true,
                false,
                true,
                None,
            ),
            SuperTableColumn::new(
                "sourceUqId".to_string(),
                "Found at URL".to_string(),
                url_column_width,
                None,
                None,
                true,
                true,
                false,
                true,
                None,
            ),
        ];

        let data: Vec<HashMap<String, String>> = url_redirects
            .iter()
            .map(|u| {
                let mut row = HashMap::new();
                row.insert("statusCode".to_string(), u.status_code.to_string());
                row.insert("url".to_string(), u.url.clone());

                // Target URL from the Location header in extras
                let target = u
                    .extras
                    .as_ref()
                    .and_then(|e| e.get("Location"))
                    .cloned()
                    .unwrap_or_else(|| "?".to_string());
                row.insert("targetUrl".to_string(), target);

                let source_url = if !u.source_uq_id.is_empty() {
                    status.get_url_by_uq_id(&u.source_uq_id).unwrap_or_default()
                } else {
                    String::new()
                };
                row.insert("sourceUqId".to_string(), source_url);
                row
            })
            .collect();

        let count_redirects = data.len();

        let mut super_table = SuperTable::new(
            SUPER_TABLE_REDIRECTS.to_string(),
            "Redirected URLs".to_string(),
            "No redirects found.".to_string(),
            columns,
            true,
            Some("url".to_string()),
            "ASC".to_string(),
            None,
            None,
            None,
        );
        super_table.set_data(data);
        status.configure_super_table_url_stripping(&mut super_table);
        output.add_super_table(&super_table);
        status.add_super_table_at_beginning(super_table);

        status.add_summary_item_by_ranges(
            "redirects",
            count_redirects as f64,
            &[(0.0, 0.0), (1.0, 2.0), (3.0, 9.0), (10.0, f64::MAX)],
            &[
                "Redirects - no redirects found",
                "Redirects - {} redirect(s) found",
                "Redirects - {} redirects found",
                "Redirects - {} redirects found",
            ],
        );
    }

    fn should_be_activated(&self) -> bool {
        true
    }

    fn get_order(&self) -> i32 {
        10
    }

    fn get_name(&self) -> &str {
        "RedirectsAnalyzer"
    }

    fn get_exec_times(&self) -> &HashMap<String, f64> {
        self.base.get_exec_times()
    }

    fn get_exec_counts(&self) -> &HashMap<String, usize> {
        self.base.get_exec_counts()
    }
}


================================================
FILE: src/analysis/result/analyzer_stats.rs
================================================
// SiteOne Crawler - AnalyzerStats
// (c) Jan Reges

use std::collections::HashMap;

/// Aggregated OK/notice/warning/critical counters per analysis, deduplicated
/// by a hash of the reported subject.
#[derive(Debug, Clone, Default)]
pub struct AnalyzerStats {
    /// analysis_name -> severity -> set of subject hashes (or just counted entries)
    severity_counts_per_analysis: HashMap<String, SeverityCounts>,
}

// HashMap used as a set: key = subject hash, value is always true
#[derive(Debug, Clone, Default)]
struct SeverityCounts {
    ok: HashMap<String, bool>,
    notice: HashMap<String, bool>,
    warning: HashMap<String, bool>,
    critical: HashMap<String, bool>,
}

impl AnalyzerStats {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn add_ok(&mut self, analysis_name: &str, subject: Option<&str>) {
        self.add_result(analysis_name, "ok", subject);
    }

    pub fn add_warning(&mut self, analysis_name: &str, subject: Option<&str>) {
        self.add_result(analysis_name, "warning", subject);
    }

    pub fn add_critical(&mut self, analysis_name: &str, subject: Option<&str>) {
        self.add_result(analysis_name, "critical", subject);
    }

    pub fn add_notice(&mut self, analysis_name: &str, subject: Option<&str>) {
        self.add_result(analysis_name, "notice", subject);
    }

    /// Render the per-analysis severity counts as super-table rows.
    pub fn to_table_data(&self) -> Vec<HashMap<String, String>> {
        let mut result = Vec::new();
        for (analysis_name, counts) in &self.severity_counts_per_analysis {
            let mut row = HashMap::new();
            row.insert("analysisName".to_string(), analysis_name.clone());
            row.insert("ok".to_string(),
counts.ok.len().to_string()); row.insert("notice".to_string(), counts.notice.len().to_string()); row.insert("warning".to_string(), counts.warning.len().to_string()); row.insert("critical".to_string(), counts.critical.len().to_string()); result.push(row); } result } fn add_result(&mut self, analysis_name: &str, severity: &str, subject: Option<&str>) { let counts = self .severity_counts_per_analysis .entry(analysis_name.to_string()) .or_default(); let subject_hash = subject.map(|s| { use md5::{Digest, Md5}; let mut hasher = Md5::new(); hasher.update(s.trim().as_bytes()); let result = hasher.finalize(); format!("{:x}", result)[..10].to_string() }); let map = match severity { "ok" => &mut counts.ok, "notice" => &mut counts.notice, "warning" => &mut counts.warning, "critical" => &mut counts.critical, _ => return, }; if let Some(hash) = subject_hash { map.insert(hash, true); } else { // Use a unique key based on current count let key = format!("_auto_{}", map.len()); map.insert(key, true); } } } ================================================ FILE: src/analysis/result/dns_analysis_result.rs ================================================ // SiteOne Crawler - DnsAnalysisResult // (c) Jan Reges #[derive(Debug, Clone)] pub struct DnsAnalysisResult { pub dns_server_name: String, pub dns_server_ip_address: String, /// DNS resolved domain names (aliases) with all CNAMEs. /// First is the original domain name and last is the final resolved domain name. 
pub resolved_domains: Vec, /// Final resolved IPv4 addresses pub ipv4_addresses: Vec, /// Final resolved IPv6 addresses (when available) pub ipv6_addresses: Vec, } impl DnsAnalysisResult { pub fn new( dns_server_name: String, dns_server_ip_address: String, resolved_domains: Vec, ipv4_addresses: Vec, ipv6_addresses: Vec, ) -> Self { Self { dns_server_name, dns_server_ip_address, resolved_domains, ipv4_addresses, ipv6_addresses, } } /// Get text description of DNS analysis result in format respecting the /// hierarchy of resolved domains/CNAMEs and IPs. pub fn get_txt_description(&self) -> String { let mut result = String::new(); for (i, domain) in self.resolved_domains.iter().enumerate() { result.push_str(&" ".repeat(i)); result.push_str(domain); result.push('\n'); } let indent = " ".repeat(self.resolved_domains.len()); for ip in &self.ipv4_addresses { result.push_str(&indent); result.push_str(&format!("IPv4: {}\n", ip)); } for ip in &self.ipv6_addresses { result.push_str(&indent); result.push_str(&format!("IPv6: {}\n", ip)); } // Add DNS server info if available (0.0.0.0 means unknown, typical for CYGWIN) if self.dns_server_ip_address != "0.0.0.0" { if self.dns_server_name != self.dns_server_ip_address { result.push_str(&format!( "\nDNS server: {} ({})\n", self.dns_server_name, self.dns_server_ip_address )); } else { result.push_str(&format!("\nDNS server: {}\n", self.dns_server_name)); } } result.trim().to_string() } } ================================================ FILE: src/analysis/result/header_stats.rs ================================================ // SiteOne Crawler - HeaderStats // (c) Jan Reges use std::collections::HashMap; use crate::utils; const MAX_UNIQUE_VALUES: usize = 20; #[derive(Debug, Clone)] pub struct HeaderStats { pub header: String, pub occurrences: usize, pub unique_values: HashMap, pub unique_values_limit_reached: bool, pub min_date_value: Option, pub max_date_value: Option, pub min_int_value: Option, pub max_int_value: Option, } impl 
HeaderStats {
    pub fn new(header: String) -> Self {
        Self {
            header,
            occurrences: 0,
            unique_values: HashMap::new(),
            unique_values_limit_reached: false,
            min_date_value: None,
            max_date_value: None,
            min_int_value: None,
            max_int_value: None,
        }
    }

    /// Record one observed header value, routing it to the right aggregation:
    /// ignored entirely, min/max date, min/max int, or distinct-value counting.
    pub fn add_value(&mut self, value: &str) {
        self.occurrences += 1;
        // One clone of the header name to satisfy the borrow checker while the
        // classification helpers borrow &self and the branches mutate self.
        let header = self.header.clone();
        if self.ignore_header_values(&header) {
            // High-cardinality / sensitive headers: count occurrences only.
        } else if self.is_value_for_min_max_date(&header) {
            self.add_value_for_min_max_date(value);
        } else if self.is_value_for_min_max_int(&header) {
            self.add_value_for_min_max_int(value);
        } else if self.unique_values.len() >= MAX_UNIQUE_VALUES {
            // Too many distinct values; remember that the list is truncated.
            self.unique_values_limit_reached = true;
        } else {
            *self.unique_values.entry(value.to_string()).or_insert(0) += 1;
        }
    }

    /// Distinct values sorted by descending occurrence count.
    pub fn get_sorted_unique_values(&self) -> Vec<(&String, &usize)> {
        let mut sorted: Vec<_> = self.unique_values.iter().collect();
        sorted.sort_by(|a, b| b.1.cmp(a.1));
        sorted
    }

    /// "content-security-policy" -> "Content-Security-Policy"; "Xss" is
    /// special-cased so "x-xss-protection" renders as "X-XSS-Protection".
    pub fn get_formatted_header_name(&self) -> String {
        let words: Vec<String> = self
            .header
            .split('-')
            .map(|w| {
                let mut chars = w.chars();
                match chars.next() {
                    Some(c) => format!("{}{}", c.to_uppercase(), chars.as_str()),
                    None => String::new(),
                }
            })
            .collect();
        words.join("-").replace("Xss", "XSS")
    }

    /// Headers whose numeric values are tracked as a min/max range.
    pub fn is_value_for_min_max_int(&self, header: &str) -> bool {
        header == "content-length" || header == "age"
    }

    /// Headers whose HTTP-date values are tracked as a min/max range.
    pub fn is_value_for_min_max_date(&self, header: &str) -> bool {
        header == "date" || header == "expires" || header == "last-modified"
    }

    /// Headers whose values are effectively unique per response and therefore
    /// pointless to enumerate.
    pub fn ignore_header_values(&self, header: &str) -> bool {
        matches!(header, "etag" | "cf-ray" | "set-cookie" | "content-disposition")
    }

    /// Minimum observed value as a string — int range wins over date range.
    pub fn get_min_value(&self) -> Option<String> {
        self.min_int_value
            .map(|v| v.to_string())
            .or_else(|| self.min_date_value.clone())
    }

    /// Maximum observed value as a string — int range wins over date range.
    pub fn get_max_value(&self) -> Option<String> {
        self.max_int_value
            .map(|v| v.to_string())
            .or_else(|| self.max_date_value.clone())
    }

    /// Short human-readable preview of the distinct values, truncated with an
    /// ellipsis to fit `max_length` display characters.
    pub fn get_values_preview(&self, max_length: usize) -> String {
        if self.unique_values.len() == 1
            && let Some(first_value) =
self.unique_values.keys().next()
        {
            if first_value.chars().count() > max_length {
                return utils::truncate_in_two_thirds(first_value, max_length, "\u{2026}", None);
            }
            return first_value.clone();
        }

        // If all values fit (with some slack for the " (count)" suffixes),
        // list them inline sorted by descending frequency.
        let values_length: usize = self.unique_values.keys().map(|k| k.len()).sum();
        if values_length < max_length.saturating_sub(10) {
            let mut sorted: Vec<_> = self.unique_values.iter().collect();
            sorted.sort_by(|a, b| b.1.cmp(a.1));
            let mut result = String::new();
            for (value, count) in sorted {
                result.push_str(&format!("{} ({}) / ", value, count));
            }
            let trimmed = result.trim().trim_end_matches(" /").to_string();
            if trimmed.is_empty() {
                return "[ignored generic values]".to_string();
            }
            return utils::truncate_in_two_thirds(&trimmed, max_length, "\u{2026}", None);
        }

        "[see values below]".to_string()
    }

    /// Track min/max for integer-valued headers; unparseable values are ignored.
    fn add_value_for_min_max_int(&mut self, value: &str) {
        if let Ok(int_val) = value.parse::<i64>() {
            match self.min_int_value {
                None => self.min_int_value = Some(int_val),
                Some(min) if int_val < min => self.min_int_value = Some(int_val),
                _ => {}
            }
            match self.max_int_value {
                None => self.max_int_value = Some(int_val),
                Some(max) if int_val > max => self.max_int_value = Some(int_val),
                _ => {}
            }
        }
    }

    fn add_value_for_min_max_date(&mut self, value: &str) {
        // Try to parse HTTP date format into a simple YYYY-MM-DD string;
        // lexicographic comparison of that format matches chronological order.
        if let Ok(dt) = chrono::DateTime::parse_from_rfc2822(value) {
            let date = dt.format("%Y-%m-%d").to_string();
            match &self.min_date_value {
                None => self.min_date_value = Some(date.clone()),
                Some(min) if &date < min => self.min_date_value = Some(date.clone()),
                _ => {}
            }
            match &self.max_date_value {
                None => self.max_date_value = Some(date),
                Some(max) if &date > max => self.max_date_value = Some(date),
                _ => {}
            }
        }
    }
}


================================================
FILE: src/analysis/result/heading_tree_item.rs
================================================
// SiteOne Crawler - HeadingTreeItem
// (c) Jan Reges

/// Minimal HTML entity escaping for text interpolated into HTML output.
/// NOTE(review): the entity strings were decoded away by extraction and have
/// been reconstructed (&amp; &lt; &gt; &quot; &#39;) — confirm the apostrophe
/// entity (&#39; vs &apos;) against the original source.
fn html_escape(s: &str) -> String {
    s.replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
        .replace('"', "&quot;")
        .replace('\'', "&#39;")
}

#[derive(Debug, Clone)]
pub struct HeadingTreeItem {
    /// Heading level (1-6)
    pub level: i32,
    /// Real heading level by heading structure in HTML
    pub real_level: Option<i32>,
    /// Heading text
    pub text: String,
    /// Heading ID attribute
    pub id: Option<String>,
    /// Children headings
    pub children: Vec<HeadingTreeItem>,
    /// Error text in case of error (typically multiple H1s or wrong heading level)
    pub error_text: Option<String>,
}

impl HeadingTreeItem {
    pub fn new(level: i32, text: String, id: Option<String>) -> Self {
        Self {
            level,
            real_level: None,
            text,
            id,
            children: Vec::new(),
            error_text: None,
        }
    }

    pub fn has_error(&self) -> bool {
        self.error_text.is_some()
    }

    /// Get heading tree as a plain text list
    pub fn get_heading_tree_txt_list(items: &[HeadingTreeItem]) -> String {
        let mut result = String::new();
        for item in items {
            result.push_str(&Self::get_heading_tree_txt(item, true));
        }
        // Collapse whitespace (the fallback ".^" regex never matches, so a
        // regex-compilation failure degrades to a no-op replacement).
        let re = regex::Regex::new(r"\s+").unwrap_or_else(|_| regex::Regex::new(".^").unwrap());
        re.replace_all(&result, " ").trim().to_string()
    }

    fn get_heading_tree_txt(item: &HeadingTreeItem, add_item: bool) -> String {
        let mut result = String::new();
        if add_item {
            // NOTE(review): the "<h{}>" prefix was stripped by extraction and
            // reconstructed here ("<h{}> {}") — confirm exact format string.
            result.push_str(&format!("<h{}> {}", item.level, item.text));
            if let Some(ref id) = item.id {
                result.push_str(&format!(" [#{}]", id));
            }
            result.push('\n');
        }
        for child in &item.children {
            // Indent children by their (level - 1) for a tree-like listing.
            result.push_str(&" ".repeat((child.level - 1) as usize));
            result.push_str(&format!("<h{}> {}", child.level, child.text));
            if let Some(ref id) = child.id {
                result.push_str(&format!(" [#{}]", id));
            }
            result.push('\n');
            result.push_str(&Self::get_heading_tree_txt(child, false));
        }
        result
    }

    /// Get heading tree as an HTML `