Repository: cornelk/goscrape
Branch: main
Commit: ee772e8edd93
Files: 39
Total size: 86.7 KB

Directory structure:
gitextract_eo_umi2a/

├── .codecov.yml
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows/
│       ├── codecov.yaml
│       ├── go.yaml
│       └── release.yml
├── .gitignore
├── .golangci.yml
├── .goreleaser.yaml
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── css/
│   └── css.go
├── go.mod
├── go.sum
├── htmlindex/
│   ├── attributes.go
│   ├── htmlindex.go
│   └── htmlindex_test.go
├── main.go
└── scraper/
    ├── checks.go
    ├── checks_test.go
    ├── cookies.go
    ├── download.go
    ├── download_test.go
    ├── fileutil.go
    ├── fileutil_test.go
    ├── fs.go
    ├── html.go
    ├── html_test.go
    ├── http.go
    ├── http_test.go
    ├── images.go
    ├── scraper.go
    ├── scraper_test.go
    ├── server.go
    ├── url.go
    └── url_test.go

================================================
FILE CONTENTS
================================================

================================================
FILE: .codecov.yml
================================================
coverage:
  status:
    patch: off
    project:
      default:
        target: 35%
        threshold: 5%


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: bug
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**To Reproduce**
Steps to reproduce the behavior:

**Expected behavior**
A clear and concise description of what you expected to happen.

**Screenshots**
If applicable, add screenshots to help explain your problem.


================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.


================================================
FILE: .github/workflows/codecov.yaml
================================================
name: codecov

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  codecov:
    timeout-minutes: 15

    name: Coverage
    runs-on: ubuntu-latest
    steps:
      - name: Set up Go 1.x
        uses: actions/setup-go@v5
        with:
          go-version: "1.24"
        id: go

      - name: Check out code into the Go module directory
        uses: actions/checkout@v4

      - name: Get dependencies
        run: go mod download

      - name: Run tests with coverage
        run: make test-coverage

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: ./.testCoverage


================================================
FILE: .github/workflows/go.yaml
================================================
name: CI

on:
  push:
    branches:
      - main
  pull_request:
    types:
      - opened
      - reopened
      - synchronize
      - ready_for_review

jobs:
  build:
    if: ${{ github.event_name == 'push' || !github.event.pull_request.draft || !contains(github.event.commits[0].message, '[skip ci]') }}
    timeout-minutes: 15

    name: Build
    runs-on: ubuntu-latest
    strategy:
      matrix:
        go: [ "1.24" ]

    steps:
      - name: Set up Go 1.x
        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go }}
        id: go

      - name: Check out code into the Go module directory
        uses: actions/checkout@v4

      - name: Install linters
        run: make install-linters

      - name: Get dependencies
        run: go mod download

      - name: Run tests
        run: make test

      - name: Run linter
        run: make lint


================================================
FILE: .github/workflows/release.yml
================================================
name: release
on:
  push:
    tags:
      - 'v*'
jobs:
  goreleaser:
    runs-on: ubuntu-latest
    steps:
      -
        name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      -
        name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.24'
          check-latest: true
          cache: true
      -
        name: Docker login
        uses: docker/login-action@v3
        with:
          registry: 'ghcr.io'
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      -
        name: Run GoReleaser
        uses: goreleaser/goreleaser-action@v6
        with:
          version: latest
          args: release --clean
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .gitignore
================================================
*.db
*.dmp
*.exe
*.iml
*.local
*.out
*.prof
*.test
.DS_Store
.fleet
.idea
.vscode
/*.log

.testCoverage
dist/
goscrape


================================================
FILE: .golangci.yml
================================================
version: "2"
run:
  modules-download-mode: readonly
linters:
  enable:
    - asasalint # Check for pass []any as any in variadic func(...any)
    - asciicheck # Checks that code identifiers do not have non-ASCII symbols
    - bidichk # Checks for dangerous unicode character sequences
    - bodyclose # Checks whether HTTP response body is closed successfully
    - containedctx # Detects struct contained context.Context field
    - contextcheck # Check whether the function uses a non-inherited context
    - copyloopvar # Detects places where loop variables are copied
    - cyclop # Checks function and package cyclomatic complexity
    - decorder # Check declaration order of types, constants, variables and functions
    - dogsled # Checks assignments with too many blank identifiers
    - durationcheck # Check for two durations multiplied together
    - embeddedstructfieldcheck # Checks embedded struct field positioning and formatting
    - err113 # Check errors handling expressions
    - errcheck # Checks for unchecked errors in Go code
    - errname # Checks that sentinel errors are prefixed with Err
    - errorlint # Find code that will cause problems with error wrapping
    - funlen # Checks for long functions
    - gocheckcompilerdirectives # Checks that go compiler directives are valid
    - gocognit # Computes and checks the cognitive complexity of functions
    - goconst # Finds repeated strings that could be replaced by a constant
    - gocritic # Provides diagnostics that check for bugs, performance and style issues
    - gocyclo # Computes and checks the cyclomatic complexity of functions
    - godot # Check if comments end in a period
    - govet # Examines Go source code and reports suspicious constructs
    - grouper # Analyze expression groups
    - iface # Detect the incorrect use of interfaces
    - ineffassign # Detects when assignments to existing variables are not used
    - intrange # Find places where for loops could use an integer range
    - ireturn # Accept Interfaces, Return Concrete Types
    - maintidx # Measures the maintainability index of each function
    - makezero # Find slice declarations with non-zero initial length
    - mirror # Reports wrong mirror patterns of bytes/strings usage
    - misspell # Finds commonly misspelled English words
    - modernize # Suggest simplifications using modern Go features
    - nakedret # Finds naked returns in functions longer than a maximum size
    - nestif # Reports deeply nested if statements
    - nilerr # Finds code that returns nil even if it checks that error is not nil
    - nilnil # Checks that there is no simultaneous return of nil error and invalid value
    - noctx # Detects missing usage of context.Context
    - nosprintfhostport # Checks for misuse of Sprintf to construct a host with port
    - perfsprint # Checks that fmt.Sprintf can be replaced with a faster alternative
    - prealloc # Find slice declarations that could be pre-allocated
    - predeclared # Find code that shadows one of Go's predeclared identifiers
    - reassign # Checks that package variables are not reassigned
    - revive # Fast, configurable, extensible linter for Go
    - staticcheck # Set of rules from staticcheck
    - testifylint # Checks usage of github.com/stretchr/testify
    - thelper # Detects test helpers which do not call t.Helper()
    - tparallel # Detects inappropriate usage of t.Parallel()
    - unconvert # Remove unnecessary type conversions
    - unparam # Reports unused function parameters
    - unused # Checks Go code for unused constants, variables, functions and types
    - usestdlibvars # Detect the possibility to use variables/constants from stdlib
    - wastedassign # Finds wasted assignment statements
    - whitespace # Checks for unnecessary newlines
    - wrapcheck # Checks that errors returned from external packages are wrapped
  settings:
    cyclop:
      max-complexity: 15
    whitespace:
      multi-if: true # Enforces newlines (or comments) after every multi-line if statement
      multi-func: true # Enforces newlines (or comments) after every multi-line function signature
  exclusions:
    rules:
      - linters:
          - err113
        text: do not define dynamic errors
formatters:
  enable:
    - gci # Check if code and import statements are formatted, with additional rules
    - gofmt # Check if the code is formatted according to 'gofmt' command
    - goimports # Checks if the code and import statements are formatted according to the 'goimports' command


================================================
FILE: .goreleaser.yaml
================================================
version: 2

before:
  hooks:
    - go mod download

builds:
  - id: goscrape
    binary: goscrape
    dir: .
    env:
      - CGO_ENABLED=0
    targets:
      - go_first_class
    flags:
      - -trimpath
    ldflags:
      - -s -w -X main.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{ .CommitDate }}

dockers:
  - id: goscrape
    image_templates:
      - "ghcr.io/cornelk/goscrape:{{ .Version }}"

universal_binaries:
  - replace: false

archives:
  - id: goscrape
    builds: ['goscrape']
    name_template: >-
      {{ .ProjectName }}_v{{ .Version }}_
      {{- if eq .Os "darwin" }}macos
      {{- else }}{{ .Os }}{{ end }}_
      {{- if eq .Arch "amd64" }}64bit
      {{- else if eq .Arch "386" }}32bit
      {{- else if eq .Arch "Darwin" }}macos
      {{- else }}{{ .Arch }}{{ end }}
    format_overrides:
      - goos: windows
        format: zip

checksum:
  name_template: 'checksums.txt'

snapshot:
  name_template: "{{ .Tag }}-snapshot"

changelog:
  disable: true


================================================
FILE: CHANGELOG.md
================================================
# Changelog for goscrape

All notable changes to this project will be documented in this file.

## [v0.3.0] - 2025-01-01

Added:

* body background attribute support
* fragments at the end of the URL are now filtered
* support srcset attribute for image downloading

Fixed:

* inline css gets processed correctly
* --include / --exclude parameter list handling

## [v0.2.0] - 2024-06-21

Added:

* add cookie loading and storing support
* add docker image build
* add header parameter support
* add user agent parameter support
* support serving directory in local webserver

Changed:

* improved error messages
* reduced dependencies and binary size

Fixed:

* fix asset downloading
* fix downloading of binary files
* fix logger output
* fix logger verbose flag usage
* fix stopping of program using ctrl+c

## [v0.1.1] - 2023-02-11

Added:

* binaries for releases
* http request timeout configuration support
* proxy configuration support

Fixed:

* do not abort scraping on error


## [v0.1.0] - 2022-06-26

First version of goscrape released.


================================================
FILE: Dockerfile
================================================
FROM gcr.io/distroless/static-debian12

COPY goscrape /

ENTRYPOINT ["./goscrape"]


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2017 Cornel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: Makefile
================================================
GOLANGCI_VERSION = v2.6.0

help: ## show help, shown by default if no target is specified
	@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'

lint: ## run code linters
	golangci-lint run

test: ## run tests
	go test -timeout 10s -race ./...

test-coverage: ## run unit tests and create test coverage
	go test -timeout 10s ./... -coverprofile .testCoverage -covermode=atomic -coverpkg=./...

test-coverage-web: test-coverage ## run unit tests and show test coverage in browser
	go tool cover -func .testCoverage | grep total | awk '{print "Total coverage: "$$3}'
	go tool cover -html=.testCoverage

install: ## install all binaries
	go install -buildvcs=false .

install-linters: ## install all linters
	go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@${GOLANGCI_VERSION}

release-snapshot: ## build release binaries from current git state as snapshot
	goreleaser release --snapshot --clean


================================================
FILE: README.md
================================================
# goscrape - create offline browsable copies of websites

[![Build status](https://github.com/cornelk/goscrape/actions/workflows/go.yaml/badge.svg?branch=main)](https://github.com/cornelk/goscrape/actions)
[![go.dev reference](https://img.shields.io/badge/go.dev-reference-007d9c?logo=go&logoColor=white&style=flat-square)](https://pkg.go.dev/github.com/cornelk/goscrape)
[![Go Report Card](https://goreportcard.com/badge/github.com/cornelk/goscrape)](https://goreportcard.com/report/github.com/cornelk/goscrape)
[![codecov](https://codecov.io/gh/cornelk/goscrape/branch/main/graph/badge.svg?token=NS5UY28V3A)](https://codecov.io/gh/cornelk/goscrape)

A web scraper built with Golang. It downloads the content of a website and allows it to be archived and read offline.

## Features

Features and advantages over existing tools like wget, httrack, Teleport Pro:

* Free and open source
* Available for all platforms that Golang supports
* JPEG and PNG images can be converted down in quality to save disk space
* Excluded URLS will not be fetched (unlike [wget](https://savannah.gnu.org/bugs/?20808))
* No incomplete temp files are left on disk
* Downloaded asset files are skipped in a new scraper run
* Assets from external domains are downloaded automatically
* Sane default values

## Limitations

* No GUI version, console only

## Installation

There are 2 options to install goscrape:

1. Download and unpack a binary release from [Releases](https://github.com/cornelk/goscrape/releases)

or

2. Compile the latest release from source:

```
go install github.com/cornelk/goscrape@latest
```

Compiling the tool from source code needs to have a recent version of [Golang](https://go.dev/) installed.

## Usage

Scrape a website by running
```
goscrape http://website.com
```

To serve the downloaded website directory in a local run webserver use
```
goscrape --serve website.com
```

## Options

```
  --include INCLUDE, -n INCLUDE
                         only include URLs with PERL Regular Expressions support
  --exclude EXCLUDE, -x EXCLUDE
                         exclude URLs with PERL Regular Expressions support
  --output OUTPUT, -o OUTPUT
                         output directory to write files to
  --depth DEPTH, -d DEPTH
                         download depth, 0 for unlimited [default: 10]
  --imagequality IMAGEQUALITY, -i IMAGEQUALITY
                         image quality, 0 to disable reencoding
  --timeout TIMEOUT, -t TIMEOUT
                         time limit in seconds for each HTTP request to connect and read the request body
  --serve SERVE, -s SERVE
                         serve the website using a webserver
  --serverport SERVERPORT, -r SERVERPORT
                         port to use for the webserver [default: 8080]
  --cookiefile COOKIEFILE, -c COOKIEFILE
                         file containing the cookie content
  --savecookiefile SAVECOOKIEFILE
                         file to save the cookie content
  --header HEADER, -h HEADER
                         HTTP header to use for scraping
  --proxy PROXY, -p PROXY
                         proxy to use in format scheme://[user:password@]host:port (supports HTTP, HTTPS, SOCKS5 protocols)
  --user USER, -u USER   user[:password] to use for HTTP authentication
  --useragent USERAGENT, -a USERAGENT
                         user agent to use for scraping
  --verbose, -v          verbose output
  --help, -h             display this help and exit
  --version              display version and exit
```

## Cookies

Cookies can be passed in a file using the `--cookiefile` parameter and a file containing
cookies in the following format:

```
[{"name":"user","value":"123"},{"name":"sessioe","value":"sid"}]
```

## Proxy Configuration

The `--proxy` flag supports multiple proxy protocols for scraping through different types of proxy servers:

### Supported Proxy Types

- **HTTP proxies**: `http://proxy.example.com:8080`
- **HTTPS proxies**: `https://proxy.example.com:8080`
- **SOCKS5 proxies**: `socks5://proxy.example.com:1080`

### Authentication

Proxies requiring authentication can include credentials in the URL:

```bash
# HTTP proxy with authentication
goscrape --proxy http://username:password@proxy.example.com:8080 http://website.com

# SOCKS5 proxy with authentication
goscrape --proxy socks5://user:pass@proxy.example.com:1080 http://website.com
```


================================================
FILE: css/css.go
================================================
// Package css provides a CSS parser that can process CSS data and call a processor for every found URL.
package css

import (
	"net/url"
	"regexp"
	"strings"

	"github.com/cornelk/gotokit/log"
	"github.com/gorilla/css/scanner"
)

var cssURLRe = regexp.MustCompile(`^url\(['"]?(.*?)['"]?\)$`)

// Token represents a token and the corresponding string.
type Token = scanner.Token

type urlProcessor func(token *Token, data string, url *url.URL)

// Process the CSS data and call a processor for every found URL.
func Process(logger *log.Logger, url *url.URL, data string, processor urlProcessor) {
	css := scanner.New(data)

	for {
		token := css.Next()
		if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError {
			break
		}
		if token.Type != scanner.TokenURI {
			continue
		}

		match := cssURLRe.FindStringSubmatch(token.Value)
		if match == nil {
			continue
		}

		src := match[1]
		if strings.HasPrefix(strings.ToLower(src), "data:") {
			continue // skip embedded data
		}

		u, err := url.Parse(src)
		if err != nil {
			logger.Error("Parsing URL failed",
				log.String("url", src),
				log.Err(err))
			continue
		}

		processor(token, src, u)
	}
}


================================================
FILE: go.mod
================================================
module github.com/cornelk/goscrape

go 1.24.0

require (
	github.com/alexflint/go-arg v1.6.0
	github.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b
	github.com/gorilla/css v1.0.1
	github.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb
	github.com/stretchr/testify v1.11.1
	golang.org/x/net v0.46.0
)

require (
	github.com/alexflint/go-scalar v1.2.0 // indirect
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/kr/pretty v0.3.0 // indirect
	github.com/pmezard/go-difflib v1.0.0 // indirect
	gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
)


================================================
FILE: go.sum
================================================
github.com/alexflint/go-arg v1.6.0 h1:wPP9TwTPO54fUVQl4nZoxbFfKCcy5E6HBCumj1XVRSo=
github.com/alexflint/go-arg v1.6.0/go.mod h1:A7vTJzvjoaSTypg4biM5uYNTkJ27SkNTArtYXnlqVO8=
github.com/alexflint/go-scalar v1.2.0 h1:WR7JPKkeNpnYIOfHRa7ivM21aWAdHD0gEWHCx+WQBRw=
github.com/alexflint/go-scalar v1.2.0/go.mod h1:LoFvNMqS1CPrMVltza4LvnGKhaSpc3oyLEBUZVhhS2o=
github.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b h1:jT7eWHlrlvjVXSV47U4IjdgI75nLhf/327EHYaGLzso=
github.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b/go.mod h1:05rfUdBvcZdkWv0/oZfXevCTUzos6WafYzkPzkSkKbY=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=
github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=
github.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb h1:GlQyMv2C48qmfPItvAXFoyN341Swxp9JNVeUZxnmbJw=
github.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=


================================================
FILE: htmlindex/attributes.go
================================================
package htmlindex

import (
	"net/url"

	"github.com/cornelk/gotokit/log"
	"github.com/cornelk/gotokit/set"
	"golang.org/x/net/html"
)

type nodeAttributeParserData struct {
	logger    *log.Logger
	url       *url.URL
	node      *html.Node
	attribute string
	value     string
}

// nodeAttributeParser returns the URL values of the attribute of the node and
// whether the attribute has been processed.
type nodeAttributeParser func(data nodeAttributeParserData) ([]string, bool)

// Node describes an HTML tag and its attributes that can contain URLs.
type Node struct {
	Attributes []string

	noChildParsing bool
	parser         nodeAttributeParser
}

// nolint: revive
const (
	BackgroundAttribute = "background"
	HrefAttribute       = "href"

	DataSrcAttribute = "data-src"
	SrcAttribute     = "src"

	DataSrcSetAttribute = "data-srcset"
	SrcSetAttribute     = "srcset"
)

// nolint: revive
const (
	ATag      = "a"
	BodyTag   = "body"
	ImgTag    = "img"
	LinkTag   = "link"
	ScriptTag = "script"
	StyleTag  = "style"
)

// Nodes describes the HTML tags and their attributes that can contain URL.
var Nodes = map[string]Node{
	ATag: {
		Attributes: []string{HrefAttribute},
	},
	BodyTag: {
		Attributes: []string{BackgroundAttribute},
	},
	ImgTag: {
		Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute},
		parser:     srcSetValueSplitter,
	},
	LinkTag: {
		Attributes: []string{HrefAttribute},
	},
	ScriptTag: {
		Attributes: []string{SrcAttribute},
	},
	StyleTag: {
		noChildParsing: true,
		parser:         styleParser,
	},
}

// SrcSetAttributes contains the attributes that contain srcset values.
var SrcSetAttributes = set.NewFromSlice([]string{
	DataSrcSetAttribute,
	SrcSetAttribute,
})


================================================
FILE: htmlindex/htmlindex.go
================================================
// Package htmlindex provides an index for all HTML tags of relevance for scraping.
package htmlindex

import (
	"fmt"
	"net/url"
	"slices"
	"sort"
	"strings"

	"github.com/cornelk/goscrape/css"
	"github.com/cornelk/gotokit/log"
	"golang.org/x/net/html"
)

// Index provides an index for all HTML tags of relevance for scraping.
type Index struct {
	logger *log.Logger

	// key is HTML tag, value is a map of all its urls and the HTML nodes for it
	data map[string]map[string][]*html.Node
}

// New returns a new index.
func New(logger *log.Logger) *Index {
	return &Index{
		logger: logger,
		data:   make(map[string]map[string][]*html.Node),
	}
}

// Index the given HTML document.
func (idx *Index) Index(baseURL *url.URL, node *html.Node) {
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		switch child.Type {
		case html.ElementNode:
			idx.indexElementNode(baseURL, node, child)
		default:
		}
	}
}

func (idx *Index) indexElementNode(baseURL *url.URL, node, child *html.Node) {
	var references []string

	info, ok := Nodes[child.Data]
	if ok {
		references = idx.nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...)
	}

	m, ok := idx.data[child.Data]
	if !ok {
		m = map[string][]*html.Node{}
		idx.data[child.Data] = m
	}

	for _, reference := range references {
		m[reference] = append(m[reference], child)
	}

	if node.FirstChild != nil && !info.noChildParsing {
		idx.Index(baseURL, child)
	}
}

// URLs returns all URLs of the references found for a specific tag.
func (idx *Index) URLs(tag string) ([]*url.URL, error) {
	m, ok := idx.data[tag]
	if !ok {
		return nil, nil
	}

	data := make([]string, 0, len(m))
	for key := range m {
		data = append(data, key)
	}
	sort.Strings(data)

	urls := make([]*url.URL, 0, len(m))
	for _, fullURL := range data {
		u, err := url.Parse(fullURL)
		if err != nil {
			return nil, fmt.Errorf("parsing URL '%s': %w", fullURL, err)
		}
		urls = append(urls, u)
	}

	return urls, nil
}

// Nodes returns a map of all URLs and their HTML nodes.
func (idx *Index) Nodes(tag string) map[string][]*html.Node {
	m, ok := idx.data[tag]
	if ok {
		return m
	}
	return map[string][]*html.Node{}
}

// nodeAttributeURLs returns resolved URLs based on the base URL and the HTML node attribute values.
func (idx *Index) nodeAttributeURLs(baseURL *url.URL, node *html.Node,
	parser nodeAttributeParser, attributeNames ...string) []string {

	var results []string

	processReferences := func(references []string) {
		for _, reference := range references {
			ur, err := url.Parse(reference)
			if err != nil {
				continue
			}

			ur = baseURL.ResolveReference(ur)
			results = append(results, ur.String())
		}
	}

	for _, attr := range node.Attr {
		if !slices.Contains(attributeNames, attr.Key) {
			continue
		}

		var references []string
		var parserHandled bool

		if parser != nil {
			data := nodeAttributeParserData{
				logger:    idx.logger,
				url:       baseURL,
				node:      node,
				attribute: attr.Key,
				value:     strings.TrimSpace(attr.Val),
			}
			references, parserHandled = parser(data)
		}
		if parser == nil || !parserHandled {
			references = append(references, strings.TrimSpace(attr.Val))
		}

		processReferences(references)
	}

	// special case to support style tag
	if len(attributeNames) == 0 && parser != nil {
		data := nodeAttributeParserData{
			logger: idx.logger,
			url:    baseURL,
			node:   node,
		}
		references, _ := parser(data)
		processReferences(references)
	}

	return results
}

// srcSetValueSplitter returns the URL values of the srcset attribute of img nodes.
func srcSetValueSplitter(data nodeAttributeParserData) ([]string, bool) {
	if !SrcSetAttributes.Contains(data.attribute) {
		return nil, false
	}

	// split the set of responsive images
	values := strings.Split(data.value, ",")

	for i, value := range values {
		value = strings.TrimSpace(value)
		// remove the width in pixels after the url
		values[i], _, _ = strings.Cut(value, " ")
	}

	return values, true
}

// styleParser returns the URL values of a CSS style tag.
func styleParser(data nodeAttributeParserData) ([]string, bool) {
	if data.node.FirstChild == nil {
		return nil, false
	}

	var urls []string
	processor := func(_ *css.Token, _ string, url *url.URL) {
		urls = append(urls, url.String())
	}

	cssData := data.node.FirstChild.Data
	css.Process(data.logger, data.url, cssData, processor)

	return urls, true
}


================================================
FILE: htmlindex/htmlindex_test.go
================================================
package htmlindex

import (
	"bytes"
	"net/url"
	"testing"

	"github.com/cornelk/gotokit/log"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"golang.org/x/net/html"
)

func TestIndex(t *testing.T) {
	input := []byte(`
<html lang="es">
<a href="https://domain.com/wp-content/uploads/document.pdf" rel="doc">Guide</a>
<img src="/test.jpg"/> 
</html>
`)

	idx := testSetup(t, input)

	// check a tag
	nodeTag := "a"
	references, err := idx.URLs(nodeTag)
	require.NoError(t, err)
	require.Len(t, references, 1)

	tagURL := "https://domain.com/wp-content/uploads/document.pdf"
	assert.Equal(t, tagURL, references[0].String())

	urls := idx.Nodes(nodeTag)
	require.Len(t, urls, 1)
	nodes, ok := urls[tagURL]
	require.True(t, ok)
	require.Len(t, nodes, 1)
	node := nodes[0]
	assert.Equal(t, nodeTag, node.Data)

	// check img tag
	nodeTag = "img"
	references, err = idx.URLs(nodeTag)
	require.NoError(t, err)
	require.Len(t, references, 1)

	tagURL = "https://domain.com/test.jpg"
	assert.Equal(t, tagURL, references[0].String())

	// check for not existing tag
	nodeTag = "not-existing"
	references, err = idx.URLs(nodeTag)
	require.NoError(t, err)
	require.Empty(t, references)
	urls = idx.Nodes(nodeTag)
	require.Empty(t, urls)
}

func TestIndexImg(t *testing.T) {
	input := []byte(`
<html lang="es">
<body background="bg.jpg"></body>
<img src="test.jpg" srcset="test-480w.jpg 480w, test-800w.jpg 800w"/> 
</body>
</html>
`)

	idx := testSetup(t, input)
	references, err := idx.URLs(ImgTag)
	require.NoError(t, err)
	require.Len(t, references, 3)
	assert.Equal(t, "https://domain.com/test-480w.jpg", references[0].String())
	assert.Equal(t, "https://domain.com/test-800w.jpg", references[1].String())
	assert.Equal(t, "https://domain.com/test.jpg", references[2].String())

	references, err = idx.URLs(BodyTag)
	require.NoError(t, err)
	require.Len(t, references, 1)
	assert.Equal(t, "https://domain.com/bg.jpg", references[0].String())
}

func testSetup(t *testing.T, input []byte) *Index {
	t.Helper()

	buf := &bytes.Buffer{}
	_, err := buf.Write(input)
	require.NoError(t, err)

	doc, err := html.Parse(buf)
	require.NoError(t, err)

	ur, err := url.Parse("https://domain.com/")
	require.NoError(t, err)

	logger := log.NewTestLogger(t)
	idx := New(logger)
	idx.Index(ur, doc)

	return idx
}


================================================
FILE: main.go
================================================
// Package main provides a command line tool to scrape websites and create an offline browsable version on the disk.
package main

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"os"
	"strings"

	"github.com/alexflint/go-arg"
	"github.com/cornelk/goscrape/scraper"
	"github.com/cornelk/gotokit/app"
	"github.com/cornelk/gotokit/buildinfo"
	"github.com/cornelk/gotokit/env"
	"github.com/cornelk/gotokit/log"
)

var (
	version = "dev"
	commit  = ""
	date    = ""
)

type arguments struct {
	Include []string `arg:"-n,--include" help:"only include URLs with PERL Regular Expressions support"`
	Exclude []string `arg:"-x,--exclude" help:"exclude URLs with PERL Regular Expressions support"`
	Output  string   `arg:"-o,--output" help:"output directory to write files to"`
	URLs    []string `arg:"positional"`

	Depth        int64 `arg:"-d,--depth" help:"download depth, 0 for unlimited" default:"10"`
	ImageQuality int64 `arg:"-i,--imagequality" help:"image quality, 0 to disable reencoding"`
	Timeout      int64 `arg:"-t,--timeout" help:"time limit in seconds for each HTTP request to connect and read the request body"`

	Serve      string `arg:"-s,--serve" help:"serve the website using a webserver"`
	ServerPort int16  `arg:"-r,--serverport" help:"port to use for the webserver" default:"8080"`

	CookieFile     string `arg:"-c,--cookiefile" help:"file containing the cookie content"`
	SaveCookieFile string `arg:"--savecookiefile" help:"file to save the cookie content"`

	Headers   []string `arg:"-h,--header" help:"HTTP header to use for scraping"`
	Proxy     string   `arg:"-p,--proxy" help:"proxy to use in format scheme://[user:password@]host:port (supports HTTP, HTTPS, SOCKS5 protocols)"`
	User      string   `arg:"-u,--user" help:"user[:password] to use for HTTP authentication"`
	UserAgent string   `arg:"-a,--useragent" help:"user agent to use for scraping"`

	Verbose bool `arg:"-v,--verbose" help:"verbose output"`
}

func (arguments) Description() string {
	return "Scrape a website and create an offline browsable version on the disk.\n"
}

func (arguments) Version() string {
	return fmt.Sprintf("Version: %s\n", buildinfo.Version(version, commit, date))
}

func main() {
	args, err := readArguments()
	if err != nil {
		fmt.Printf("Reading arguments failed: %s\n", err)
		os.Exit(1)
	}

	ctx := app.Context()

	if args.Verbose {
		log.SetDefaultLevel(log.DebugLevel)
	}
	logger, err := createLogger()
	if err != nil {
		fmt.Printf("Creating logger failed: %s\n", err)
		os.Exit(1)
	}

	if args.Serve != "" {
		if err := runServer(ctx, args, logger); err != nil {
			fmt.Printf("Server execution error: %s\n", err)
			os.Exit(1)
		}
		return
	}

	if err := runScraper(ctx, args, logger); err != nil {
		fmt.Printf("Scraping execution error: %s\n", err)
		os.Exit(1)
	}
}

func readArguments() (arguments, error) {
	var args arguments
	parser, err := arg.NewParser(arg.Config{}, &args)
	if err != nil {
		return arguments{}, fmt.Errorf("creating argument parser: %w", err)
	}

	if err = parser.Parse(os.Args[1:]); err != nil {
		switch {
		case errors.Is(err, arg.ErrHelp):
			parser.WriteHelp(os.Stdout)
			os.Exit(0)
		case errors.Is(err, arg.ErrVersion):
			fmt.Println(args.Version())
			os.Exit(0)
		}

		return arguments{}, fmt.Errorf("parsing arguments: %w", err)
	}

	if len(args.URLs) == 0 && args.Serve == "" {
		parser.WriteHelp(os.Stdout)
		os.Exit(0)
	}

	return args, nil
}

func runScraper(ctx context.Context, args arguments, logger *log.Logger) error {
	if len(args.URLs) == 0 {
		return nil
	}

	var username, password string
	if args.User != "" {
		sl := strings.Split(args.User, ":")
		username = sl[0]
		if len(sl) > 1 {
			password = sl[1]
		}
	}

	imageQuality := args.ImageQuality
	if args.ImageQuality < 0 || args.ImageQuality >= 100 {
		imageQuality = 0
	}

	cookies, err := readCookieFile(args.CookieFile)
	if err != nil {
		return fmt.Errorf("reading cookie: %w", err)
	}

	cfg := scraper.Config{
		Includes: args.Include,
		Excludes: args.Exclude,

		ImageQuality: uint(imageQuality),
		MaxDepth:     uint(args.Depth),
		Timeout:      uint(args.Timeout),

		OutputDirectory: args.Output,
		Username:        username,
		Password:        password,

		Cookies:   cookies,
		Header:    scraper.Headers(args.Headers),
		Proxy:     args.Proxy,
		UserAgent: args.UserAgent,
	}

	return scrapeURLs(ctx, cfg, logger, args)
}

func scrapeURLs(ctx context.Context, cfg scraper.Config,
	logger *log.Logger, args arguments) error {

	for _, url := range args.URLs {
		cfg.URL = url
		sc, err := scraper.New(logger, cfg)
		if err != nil {
			return fmt.Errorf("initializing scraper: %w", err)
		}

		logger.Info("Scraping", log.String("url", sc.URL.String()))
		if err = sc.Start(ctx); err != nil {
			if errors.Is(err, context.Canceled) {
				os.Exit(0)
			}

			return fmt.Errorf("scraping '%s': %w", sc.URL, err)
		}

		if args.SaveCookieFile != "" {
			if err := saveCookies(args.SaveCookieFile, sc.Cookies()); err != nil {
				return fmt.Errorf("saving cookies: %w", err)
			}
		}
	}

	return nil
}

func runServer(ctx context.Context, args arguments, logger *log.Logger) error {
	if err := scraper.ServeDirectory(ctx, args.Serve, args.ServerPort, logger); err != nil {
		return fmt.Errorf("serving directory: %w", err)
	}
	return nil
}

func createLogger() (*log.Logger, error) {
	logCfg, err := log.ConfigForEnv(env.Development)
	if err != nil {
		return nil, fmt.Errorf("initializing log config: %w", err)
	}
	logCfg.JSONOutput = false
	logCfg.CallerInfo = false

	logger, err := log.NewWithConfig(logCfg)
	if err != nil {
		return nil, fmt.Errorf("initializing logger: %w", err)
	}
	return logger, nil
}

func readCookieFile(cookieFile string) ([]scraper.Cookie, error) {
	if cookieFile == "" {
		return nil, nil
	}
	b, err := os.ReadFile(cookieFile)
	if err != nil {
		return nil, fmt.Errorf("reading cookie file: %w", err)
	}

	var cookies []scraper.Cookie
	if err := json.Unmarshal(b, &cookies); err != nil {
		return nil, fmt.Errorf("unmarshaling cookies: %w", err)
	}

	return cookies, nil
}

func saveCookies(cookieFile string, cookies []scraper.Cookie) error {
	if cookieFile == "" || len(cookies) == 0 {
		return nil
	}

	b, err := json.Marshal(cookies)
	if err != nil {
		return fmt.Errorf("marshaling cookies: %w", err)
	}

	if err := os.WriteFile(cookieFile, b, 0644); err != nil {
		return fmt.Errorf("saving cookies: %w", err)
	}

	return nil
}


================================================
FILE: scraper/checks.go
================================================
// Package scraper provides a web scraper that can download a website and its assets.
package scraper

import (
	"net/url"
	"strings"

	"github.com/cornelk/gotokit/log"
)

// normalizeURLPath removes trailing slashes from URL paths for duplicate detection.
// This treats URLs with and without trailing slashes as the same resource.
func normalizeURLPath(path string) string {
	if path == "" {
		return "/"
	}
	// Keep root path as is, but remove trailing slashes from other paths
	if path != "/" && strings.HasSuffix(path, "/") {
		return strings.TrimSuffix(path, "/")
	}
	return path
}

// shouldURLBeDownloaded checks whether a page should be downloaded.
// nolint: cyclop
func (s *Scraper) shouldURLBeDownloaded(url *url.URL, currentDepth uint, isAsset bool) bool {
	if url.Scheme != "http" && url.Scheme != "https" {
		return false
	}

	p := url.String()
	if url.Host == s.URL.Host {
		p = url.Path
	}
	if p == "" {
		p = "/"
	}

	// Normalize the path for duplicate detection to handle trailing slashes
	normalizedPath := normalizeURLPath(p)

	if s.processed.Contains(normalizedPath) { // was already downloaded or checked?
		if url.Fragment != "" {
			return false
		}
		return false
	}

	s.processed.Add(normalizedPath)

	if !isAsset {
		if url.Host != s.URL.Host {
			s.logger.Debug("Skipping external host page", log.String("url", url.String()))
			return false
		}

		if s.config.MaxDepth != 0 && currentDepth == s.config.MaxDepth {
			s.logger.Debug("Skipping too deep level page", log.String("url", url.String()))
			return false
		}
	}

	if s.includes != nil && !s.isURLIncluded(url) {
		return false
	}
	if s.excludes != nil && s.isURLExcluded(url) {
		return false
	}

	s.logger.Debug("New URL to download", log.String("url", url.String()))
	return true
}

func (s *Scraper) isURLIncluded(url *url.URL) bool {
	for _, re := range s.includes {
		if re.MatchString(url.Path) {
			s.logger.Info("Including URL",
				log.String("url", url.String()),
				log.Stringer("included_expression", re))
			return true
		}
	}
	return false
}

func (s *Scraper) isURLExcluded(url *url.URL) bool {
	for _, re := range s.excludes {
		if re.MatchString(url.Path) {
			s.logger.Info("Skipping URL",
				log.String("url", url.String()),
				log.Stringer("excluded_expression", re))
			return true
		}
	}
	return false
}


================================================
FILE: scraper/checks_test.go
================================================
package scraper

import (
	"net/url"
	"testing"

	"github.com/cornelk/gotokit/log"
	"github.com/cornelk/gotokit/set"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestNormalizeURLPath(t *testing.T) {
	tests := []struct {
		input    string
		expected string
	}{
		{"", "/"},
		{"/", "/"},
		{"/path", "/path"},
		{"/path/", "/path"},
		{"/path/to/resource", "/path/to/resource"},
		{"/path/to/resource/", "/path/to/resource"},
		{"/category/blog-post", "/category/blog-post"},
		{"/category/blog-post/", "/category/blog-post"},
	}

	for _, test := range tests {
		t.Run(test.input, func(t *testing.T) {
			result := normalizeURLPath(test.input)
			assert.Equal(t, test.expected, result)
		})
	}
}

func TestShouldURLBeDownloaded_TrailingSlashDuplicates(t *testing.T) {
	logger := log.NewTestLogger(t)
	cfg := Config{
		URL: "https://example.com",
	}
	scraper, err := New(logger, cfg)
	require.NoError(t, err)
	require.NotNil(t, scraper)

	// Initialize empty processed set
	scraper.processed = set.New[string]()

	// Test that URLs with and without trailing slashes are treated as duplicates
	url1, err := url.Parse("https://example.com/category/blog-post")
	require.NoError(t, err)

	url2, err := url.Parse("https://example.com/category/blog-post/")
	require.NoError(t, err)

	// First URL should be downloadable
	should1 := scraper.shouldURLBeDownloaded(url1, 0, false)
	assert.True(t, should1, "First URL should be downloadable")

	// Second URL with trailing slash should be treated as duplicate
	should2 := scraper.shouldURLBeDownloaded(url2, 0, false)
	assert.False(t, should2, "Second URL with trailing slash should be treated as duplicate")

	// Verify that the normalized path is in the processed set
	assert.True(t, scraper.processed.Contains("/category/blog-post"))
}

func TestShouldURLBeDownloaded_TrailingSlashDuplicatesReverse(t *testing.T) {
	logger := log.NewTestLogger(t)
	cfg := Config{
		URL: "https://example.com",
	}
	scraper, err := New(logger, cfg)
	require.NoError(t, err)
	require.NotNil(t, scraper)

	// Initialize empty processed set
	scraper.processed = set.New[string]()

	// Test reverse order - trailing slash first, then without
	url1, err := url.Parse("https://example.com/category/blog-post/")
	require.NoError(t, err)

	url2, err := url.Parse("https://example.com/category/blog-post")
	require.NoError(t, err)

	// First URL with trailing slash should be downloadable
	should1 := scraper.shouldURLBeDownloaded(url1, 0, false)
	assert.True(t, should1, "First URL with trailing slash should be downloadable")

	// Second URL without trailing slash should be treated as duplicate
	should2 := scraper.shouldURLBeDownloaded(url2, 0, false)
	assert.False(t, should2, "Second URL without trailing slash should be treated as duplicate")

	// Verify that the normalized path is in the processed set
	assert.True(t, scraper.processed.Contains("/category/blog-post"))
}

func TestShouldURLBeDownloaded_RootPath(t *testing.T) {
	logger := log.NewTestLogger(t)
	cfg := Config{
		URL: "https://example.com",
	}
	scraper, err := New(logger, cfg)
	require.NoError(t, err)
	require.NotNil(t, scraper)

	// Initialize empty processed set
	scraper.processed = set.New[string]()

	// Test root path normalization
	url1, err := url.Parse("https://example.com/")
	require.NoError(t, err)

	url2, err := url.Parse("https://example.com")
	require.NoError(t, err)

	// First root URL should be downloadable
	should1 := scraper.shouldURLBeDownloaded(url1, 0, false)
	assert.True(t, should1, "First root URL should be downloadable")

	// Second root URL should be treated as duplicate
	should2 := scraper.shouldURLBeDownloaded(url2, 0, false)
	assert.False(t, should2, "Second root URL should be treated as duplicate")

	// Verify that the normalized root path is in the processed set
	assert.True(t, scraper.processed.Contains("/"))
}

func TestShouldURLBeDownloaded_ExternalURLs(t *testing.T) {
	logger := log.NewTestLogger(t)
	cfg := Config{
		URL: "https://example.com",
	}
	scraper, err := New(logger, cfg)
	require.NoError(t, err)
	require.NotNil(t, scraper)

	// Initialize empty processed set
	scraper.processed = set.New[string]()

	// Test external URLs with trailing slashes as assets
	url1, err := url.Parse("https://external.com/path.css")
	require.NoError(t, err)

	url2, err := url.Parse("https://external.com/path.css/")
	require.NoError(t, err)

	// First external asset should be downloadable (if it passes other checks)
	should1 := scraper.shouldURLBeDownloaded(url1, 0, true) // asset = true

	// Second external asset with trailing slash should be treated as duplicate
	should2 := scraper.shouldURLBeDownloaded(url2, 0, true) // asset = true

	// First should pass, second should be blocked as duplicate
	assert.True(t, should1, "First external asset should be downloadable")
	assert.False(t, should2, "Second external asset with trailing slash should be treated as duplicate")

	// Verify that the normalized external URL is in the processed set
	normalizedURL1 := normalizeURLPath(url1.String())
	assert.True(t, scraper.processed.Contains(normalizedURL1))
}


================================================
FILE: scraper/cookies.go
================================================
package scraper

import (
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"time"
)

// Cookie represents a cookie, it copies parts of the http.Cookie struct but changes
// the JSON marshaling to exclude empty fields.
type Cookie struct {
	Name  string `json:"name"`
	Value string `json:"value,omitempty"`

	Expires *time.Time `json:"expires,omitempty"`
}

// Cookies returns the current cookies.
func (s *Scraper) Cookies() []Cookie {
	httpCookies := s.cookies.Cookies(s.URL)
	cookies := make([]Cookie, 0, len(httpCookies))

	for _, c := range httpCookies {
		cookie := Cookie{
			Name:  c.Name,
			Value: c.Value,
		}
		if !c.Expires.IsZero() {
			cookie.Expires = &c.Expires
		}
		cookies = append(cookies, cookie)
	}

	return cookies
}

func createCookieJar(u *url.URL, cookies []Cookie) (*cookiejar.Jar, error) {
	jar, err := cookiejar.New(nil)
	if err != nil {
		return nil, fmt.Errorf("creating cookie jar: %w", err)
	}

	httpCookies := make([]*http.Cookie, 0, len(cookies))
	for _, c := range cookies {
		h := &http.Cookie{
			Name:  c.Name,
			Value: c.Value,
		}
		if c.Expires != nil {
			h.Expires = *c.Expires
		}
		httpCookies = append(httpCookies, h)
	}

	jar.SetCookies(u, httpCookies)
	return jar, nil
}


================================================
FILE: scraper/download.go
================================================
package scraper

import (
	"context"
	"errors"
	"fmt"
	"net/url"
	"path"

	"github.com/cornelk/goscrape/css"
	"github.com/cornelk/goscrape/htmlindex"
	"github.com/cornelk/gotokit/log"
)

// assetProcessor is a processor of a downloaded asset that can transform
// a downloaded file content before it will be stored on disk.
type assetProcessor func(URL *url.URL, data []byte) []byte

var tagsWithReferences = []string{
	htmlindex.LinkTag,
	htmlindex.ScriptTag,
	htmlindex.BodyTag,
	htmlindex.StyleTag,
}

func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {
	references, err := index.URLs(htmlindex.BodyTag)
	if err != nil {
		s.logger.Error("Getting body node URLs failed", log.Err(err))
	}
	s.imagesQueue = append(s.imagesQueue, references...)

	references, err = index.URLs(htmlindex.ImgTag)
	if err != nil {
		s.logger.Error("Getting img node URLs failed", log.Err(err))
	}
	s.imagesQueue = append(s.imagesQueue, references...)

	for _, tag := range tagsWithReferences {
		references, err = index.URLs(tag)
		if err != nil {
			s.logger.Error("Getting node URLs failed",
				log.String("node", tag),
				log.Err(err))
		}

		var processor assetProcessor
		if tag == htmlindex.LinkTag {
			processor = s.cssProcessor
		}
		for _, ur := range references {
			if err := s.downloadAsset(ctx, ur, processor); err != nil && errors.Is(err, context.Canceled) {
				return err
			}
		}
	}

	for _, image := range s.imagesQueue {
		if err := s.downloadAsset(ctx, image, s.checkImageForRecode); err != nil && errors.Is(err, context.Canceled) {
			return err
		}
	}
	s.imagesQueue = nil
	return nil
}

// downloadAsset downloads an asset if it does not exist on disk yet.
func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor assetProcessor) error {
	u.Fragment = ""
	urlFull := u.String()

	if !s.shouldURLBeDownloaded(u, 0, true) {
		return nil
	}

	filePath := s.getFilePath(u, false)
	if s.fileExists(filePath) {
		return nil
	}

	s.logger.Info("Downloading asset", log.String("url", urlFull))
	data, _, err := s.httpDownloader(ctx, u)
	if err != nil {
		s.logger.Error("Downloading asset failed",
			log.String("url", urlFull),
			log.Err(err))
		return fmt.Errorf("downloading asset: %w", err)
	}

	if processor != nil {
		data = processor(u, data)
	}

	if err = s.fileWriter(filePath, data); err != nil {
		s.logger.Error("Writing asset file failed",
			log.String("url", urlFull),
			log.String("file", filePath),
			log.Err(err))
	}

	return nil
}

func (s *Scraper) cssProcessor(baseURL *url.URL, data []byte) []byte {
	urls := make(map[string]string)

	processor := func(token *css.Token, data string, u *url.URL) {
		s.imagesQueue = append(s.imagesQueue, u)

		cssPath := *u
		cssPath.Path = path.Dir(cssPath.Path) + "/"
		resolved := resolveURL(&cssPath, data, s.URL.Host, false, "")
		urls[token.Value] = resolved
	}

	cssData := string(data)
	css.Process(s.logger, baseURL, cssData, processor)

	if len(urls) == 0 {
		return data
	}

	for ori, filePath := range urls {
		cssData = replaceCSSUrls(ori, filePath, cssData)
		s.logger.Debug("CSS Element relinked",
			log.String("url", ori),
			log.String("fixed_url", filePath))
	}

	return []byte(cssData)
}


================================================
FILE: scraper/download_test.go
================================================
package scraper

import (
	"net/url"
	"testing"

	"github.com/cornelk/gotokit/log"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestCSSProcessor(t *testing.T) {
	logger := log.NewTestLogger(t)
	cfg := Config{
		URL: "http://localhost",
	}
	s, err := New(logger, cfg)
	require.NoError(t, err)

	var fixtures = map[string]string{
		"url('http://localhost/uri/between/single/quote')": "http://localhost/uri/between/single/quote",
		`url("http://localhost/uri/between/double/quote")`: "http://localhost/uri/between/double/quote",
		"url(http://localhost/uri)":                        "http://localhost/uri",
		"url(data:image/gif;base64,R0lGODl)":               "",
		`div#gopher {
			background: url(/doc/gopher/frontpage.png) no-repeat;
			height: 155px;
			}`: "http://localhost/doc/gopher/frontpage.png",
	}

	u, _ := url.Parse("http://localhost")
	for input, expected := range fixtures {
		s.imagesQueue = nil
		s.cssProcessor(u, []byte(input))

		if expected == "" {
			assert.Empty(t, s.imagesQueue)
			continue
		}

		assert.NotEmpty(t, s.imagesQueue)

		res := s.imagesQueue[0].String()
		assert.Equal(t, expected, res)
	}
}


================================================
FILE: scraper/fileutil.go
================================================
package scraper

import (
	"fmt"
	"hash/fnv"
	"net/url"
	"path/filepath"
	"strings"
)

const (
	// PageExtension is the file extension that downloaded pages get.
	PageExtension = ".html"
	// PageDirIndex is the file name of the index file for every dir.
	PageDirIndex = "index" + PageExtension
	// MaxFilenameLength is the maximum length for a filename component to ensure filesystem compatibility.
	MaxFilenameLength = 200
)

// getFilePath returns a file path for a URL to store the URL content in.
// The isAPage parameter is crucial: it tells us whether this URL contains HTML content
// that should be treated as a web page (with .html extensions and directory indexing)
// or if it's a binary file that should keep its original path unchanged.
// Without this distinction, binary files would get corrupted paths like image.jpg.html.
func (s *Scraper) getFilePath(url *url.URL, isAPage bool) string {
	fileName := url.Path
	if isAPage {
		// This is HTML content - apply web page naming conventions
		fileName = getPageFilePath(url)
	}
	// If not a page, keep the original URL path for binary files

	var externalHost string
	if url.Host != s.URL.Host {
		externalHost = "_" + url.Host // _ is a prefix for external domains on the filesystem
	}

	// Split the file path into directory and filename components
	dir := filepath.Dir(fileName)
	base := filepath.Base(fileName)

	// Truncate the filename component if it's too long
	truncatedBase := truncateFilename(base)

	// Reconstruct the path with the truncated filename
	if dir == "." {
		fileName = truncatedBase
	} else {
		fileName = filepath.Join(dir, truncatedBase)
	}

	return filepath.Join(s.config.OutputDirectory, s.URL.Host, externalHost, fileName)
}

// getPageFilePath returns a filename for a URL that represents a web page.
// This function adds .html extensions and handles directory indexing,
// which is what we want for HTML content but NOT for binary files like images or PDFs.
func getPageFilePath(url *url.URL) string {
	fileName := url.Path

	// root of domain will be index.html
	switch {
	case fileName == "" || fileName == "/":
		fileName = PageDirIndex
		// directory index will be index.html in the directory

	case fileName[len(fileName)-1] == '/':
		fileName += PageDirIndex

	default:
		ext := filepath.Ext(fileName)
		// if file extension is missing add .html, otherwise keep the existing file extension
		if ext == "" {
			fileName += PageExtension
		}
	}

	return fileName
}

// truncateFilename truncates a filename if it exceeds MaxFilenameLength while preserving the extension.
func truncateFilename(filename string) string {
	if len(filename) <= MaxFilenameLength {
		return filename
	}

	ext := filepath.Ext(filename)
	baseName := strings.TrimSuffix(filename, ext)

	// Calculate how much space we need for hash and extension
	hashLength := 8 // Using first 8 hex characters (from 32-bit FNV)
	reservedLength := hashLength + len(ext)

	// If the extension alone is too long, truncate it too
	if reservedLength > MaxFilenameLength {
		ext = ext[:MaxFilenameLength-hashLength]
		reservedLength = hashLength + len(ext)
	}

	maxBaseLength := MaxFilenameLength - reservedLength
	if maxBaseLength <= 0 {
		maxBaseLength = 1
	}

	truncatedBase := baseName[:maxBaseLength]

	// Generate FNV-1a hash of original filename to ensure uniqueness
	h := fnv.New32a()
	_, _ = h.Write([]byte(filename))
	hashStr := fmt.Sprintf("%08x", h.Sum32())[:hashLength]

	return truncatedBase + hashStr + ext
}


================================================
FILE: scraper/fileutil_test.go
================================================
package scraper

import (
	"net/url"
	"os"
	"strings"
	"testing"

	"github.com/cornelk/gotokit/log"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestGetFilePath(t *testing.T) {
	type filePathFixture struct {
		BaseURL          string
		DownloadURL      string
		ExpectedFilePath string
	}

	pathSeparator := string(os.PathSeparator)
	expectedBasePath := "google.com" + pathSeparator
	var fixtures = []filePathFixture{
		{"https://google.com/", "https://github.com/", expectedBasePath + "_github.com" + pathSeparator + "index.html"},
		{"https://google.com/", "https://github.com/#fragment", expectedBasePath + "_github.com" + pathSeparator + "index.html"},
		{"https://google.com/", "https://github.com/test", expectedBasePath + "_github.com" + pathSeparator + "test.html"},
		{"https://google.com/", "https://github.com/test/", expectedBasePath + "_github.com" + pathSeparator + "test" + pathSeparator + "index.html"},
		{"https://google.com/", "https://github.com/test.aspx", expectedBasePath + "_github.com" + pathSeparator + "test.aspx"},
		{"https://google.com/", "https://google.com/settings", expectedBasePath + "settings.html"},
	}

	var cfg Config
	logger := log.NewTestLogger(t)
	for _, fix := range fixtures {
		cfg.URL = fix.BaseURL
		s, err := New(logger, cfg)
		require.NoError(t, err)

		URL, err := url.Parse(fix.DownloadURL)
		require.NoError(t, err)

		output := s.getFilePath(URL, true)
		assert.Equal(t, fix.ExpectedFilePath, output)
	}
}

func TestTruncateFilename(t *testing.T) {
	tests := []struct {
		name     string
		filename string
		expected func(string) bool // function to validate the result
	}{
		{
			name:     "short filename unchanged",
			filename: "test.css",
			expected: func(result string) bool { return result == "test.css" },
		},
		{
			name:     "long filename gets truncated",
			filename: "very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename.css",
			expected: func(result string) bool {
				return len(result) <= MaxFilenameLength &&
					len(result) > 0 &&
					result != "very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename.css" &&
					result[len(result)-4:] == ".css"
			},
		},
		{
			name:     "filename without extension",
			filename: "very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename",
			expected: func(result string) bool {
				return len(result) <= MaxFilenameLength &&
					len(result) > 0 &&
					result != "very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename"
			},
		},
		{
			name:     "empty filename",
			filename: "",
			expected: func(result string) bool { return result == "" },
		},
		{
			name:     "filename at max length",
			filename: strings.Repeat("a", MaxFilenameLength),
			expected: func(result string) bool { return len(result) == MaxFilenameLength },
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := truncateFilename(tt.filename)
			assert.True(t, tt.expected(result), "filename: %q, result: %q", tt.filename, result)
		})
	}
}

func TestTruncateFilenameUniqueness(t *testing.T) {
	// Test that different long filenames with the same prefix produce different results
	longPrefix := "this-is-a-very-long-filename-prefix-that-will-be-truncated-and-should-produce-different-results-based-on-the-hash-suffix-when-the-full-filename-is-different"

	filename1 := longPrefix + "-file1.css"
	filename2 := longPrefix + "-file2.css"

	result1 := truncateFilename(filename1)
	result2 := truncateFilename(filename2)

	assert.NotEqual(t, result1, result2, "Different long filenames should produce different truncated results")
	assert.LessOrEqual(t, len(result1), MaxFilenameLength, "Result1 should be within max length")
	assert.LessOrEqual(t, len(result2), MaxFilenameLength, "Result2 should be within max length")
}


================================================
FILE: scraper/fs.go
================================================
package scraper

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/cornelk/gotokit/log"
)

// createDownloadPath creates the download path if it does not exist yet.
func (s *Scraper) createDownloadPath(path string) error {
	if path == "" {
		return nil
	}

	s.logger.Debug("Creating dir", log.String("path", path))
	if err := os.MkdirAll(path, os.ModePerm); err != nil {
		return fmt.Errorf("creating directory '%s': %w", path, err)
	}
	return nil
}

func (s *Scraper) writeFile(filePath string, data []byte) error {
	dir := filepath.Dir(filePath)
	if len(dir) < len(s.URL.Host) { // nothing to append if it is the root dir
		dir = filepath.Join(".", s.URL.Host, dir)
	}

	if err := s.dirCreator(dir); err != nil {
		return err
	}

	s.logger.Debug("Creating file", log.String("path", filePath))
	f, err := os.Create(filePath)
	if err != nil {
		return fmt.Errorf("creating file '%s': %w", filePath, err)
	}

	if _, err = f.Write(data); err != nil {
		// nolint: wrapcheck
		_ = f.Close() // try to close and remove file but return the first error
		_ = os.Remove(filePath)
		return fmt.Errorf("writing to file: %w", err)
	}

	if err := f.Close(); err != nil {
		return fmt.Errorf("closing file: %w", err)
	}
	return nil
}

func (s *Scraper) fileExists(filePath string) bool {
	if _, err := os.Stat(filePath); !os.IsNotExist(err) {
		return true
	}
	return false
}


================================================
FILE: scraper/html.go
================================================
package scraper

import (
	"bytes"
	"fmt"
	"net/url"
	"slices"
	"strings"

	"github.com/cornelk/goscrape/css"
	"github.com/cornelk/goscrape/htmlindex"
	"github.com/cornelk/gotokit/log"
	"golang.org/x/net/html"
)

// ignoredURLPrefixes contains a list of URL prefixes that do not need to bo adjusted.
var ignoredURLPrefixes = []string{
	"#",       // fragment
	"/#",      // fragment
	"data:",   // embedded data
	"mailto:", // mail address
}

// fixURLReferences fixes URL references to point to relative file names.
// It returns a bool that indicates that no reference needed to be fixed,
// in this case the returned HTML string will be empty.
func (s *Scraper) fixURLReferences(url *url.URL, doc *html.Node,
	index *htmlindex.Index) ([]byte, bool, error) {

	relativeToRoot := urlRelativeToRoot(url)
	if !s.fixHTMLNodeURLs(url, relativeToRoot, index) {
		return nil, false, nil
	}

	var rendered bytes.Buffer
	if err := html.Render(&rendered, doc); err != nil {
		return nil, false, fmt.Errorf("rendering html: %w", err)
	}
	return rendered.Bytes(), true, nil
}

// fixHTMLNodeURLs processes all HTML nodes that contain URLs that need to be fixed
// to link to downloaded files. It returns whether any URLS have been fixed.
func (s *Scraper) fixHTMLNodeURLs(baseURL *url.URL, relativeToRoot string, index *htmlindex.Index) bool {
	var changed bool

	for tag, nodeInfo := range htmlindex.Nodes {
		isHyperlink := tag == htmlindex.ATag

		urls := index.Nodes(tag)
		for _, nodes := range urls {
			for _, node := range nodes {
				switch node.Data {
				case htmlindex.StyleTag:
					if s.fixScriptNodeURL(baseURL, node, isHyperlink, relativeToRoot) {
						changed = true
					}
				default:
					if s.fixNodeURL(baseURL, nodeInfo.Attributes, node, isHyperlink, relativeToRoot) {
						changed = true
					}
				}
			}
		}
	}

	return changed
}

// fixNodeURL fixes the URL references of a HTML node to point to a relative file name.
// It returns whether any attribute value bas been adjusted.
func (s *Scraper) fixNodeURL(baseURL *url.URL, attributes []string, node *html.Node,
	isHyperlink bool, relativeToRoot string) bool {

	var changed bool

	for i, attr := range node.Attr {
		if !slices.Contains(attributes, attr.Key) {
			continue
		}

		value := strings.TrimSpace(attr.Val)
		if value == "" {
			continue
		}

		for _, prefix := range ignoredURLPrefixes {
			if strings.HasPrefix(value, prefix) {
				return false
			}
		}

		var adjusted string

		if htmlindex.SrcSetAttributes.Contains(attr.Key) {
			adjusted = resolveSrcSetURLs(baseURL, value, s.URL.Host, isHyperlink, relativeToRoot)
		} else {
			adjusted = resolveURL(baseURL, value, s.URL.Host, isHyperlink, relativeToRoot)
		}

		if adjusted == value { // check for no change
			continue
		}

		s.logger.Debug("HTML node relinked",
			log.String("value", value),
			log.String("fixed_value", adjusted))

		attribute := &node.Attr[i]
		attribute.Val = adjusted
		changed = true
	}

	return changed
}

// fixScriptNodeURL fixes the URL references of a HTML script node to point to a relative file name.
// It returns whether any attribute value bas been adjusted.
func (s *Scraper) fixScriptNodeURL(baseURL *url.URL, node *html.Node,
	isHyperlink bool, relativeToRoot string) bool {

	if node.FirstChild == nil {
		return false
	}

	urls := map[string]string{}

	processor := func(_ *css.Token, before string, _ *url.URL) {
		adjusted := resolveURL(baseURL, before, s.URL.Host, isHyperlink, relativeToRoot)
		if before != adjusted {
			urls[before] = adjusted
		}
	}

	cssData := node.FirstChild.Data
	css.Process(s.logger, baseURL, cssData, processor)

	var changed bool

	for before, filePath := range urls {
		cssData = replaceCSSUrls(before, filePath, cssData)
		s.logger.Debug("CSS Element relinked",
			log.String("url", before),
			log.String("fixed_url", filePath))
		changed = true
	}

	node.FirstChild.Data = cssData

	return changed
}

func resolveSrcSetURLs(base *url.URL, srcSetValue, mainPageHost string, isHyperlink bool, relativeToRoot string) string {
	// split the set of responsive images
	values := strings.Split(srcSetValue, ",")

	for i, value := range values {
		value = strings.TrimSpace(value)
		parts := strings.Split(value, " ")
		parts[0] = resolveURL(base, parts[0], mainPageHost, isHyperlink, relativeToRoot)
		values[i] = strings.Join(parts, " ")
	}

	return strings.Join(values, ", ")
}

func replaceCSSUrls(before, after, content string) string {
	prefixes := []string{
		"\"", "'", "",
	}

	for _, prefix := range prefixes {
		wrong := fmt.Sprintf("url(%s%s%s)", prefix, before, prefix)
		fixed := "url('" + after + "')"
		content = strings.ReplaceAll(content, wrong, fixed)
	}

	return content
}


================================================
FILE: scraper/html_test.go
================================================
package scraper

import (
	"bytes"
	"testing"

	"github.com/cornelk/goscrape/htmlindex"
	"github.com/cornelk/gotokit/log"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"golang.org/x/net/html"
)

func TestFixURLReferences(t *testing.T) {
	logger := log.NewTestLogger(t)
	cfg := Config{
		URL: "http://domain.com",
	}
	s, err := New(logger, cfg)
	require.NoError(t, err)

	b := []byte(`
<html lang="es">
<a href="https://domain.com/wp-content/uploads/document.pdf" rel="doc">Guide</a>
<img src="https://domain.com/test.jpg" srcset="https://domain.com/test-480w.jpg 480w, https://domain.com/test-800w.jpg 800w"/> 
</html>
`)

	buf := &bytes.Buffer{}
	_, err = buf.Write(b)
	require.NoError(t, err)

	doc, err := html.Parse(buf)
	require.NoError(t, err)

	index := htmlindex.New(logger)
	index.Index(s.URL, doc)

	ref, fixed, err := s.fixURLReferences(s.URL, doc, index)
	require.NoError(t, err)
	assert.True(t, fixed)

	expected := "<html lang=\"es\"><head></head><body>" +
		"<a href=\"wp-content/uploads/document.pdf\" rel=\"doc\">Guide</a>\n" +
		"<img src=\"test.jpg\" srcset=\"test-480w.jpg 480w, test-800w.jpg 800w\"/> \n\n" +
		"</body></html>"
	assert.Equal(t, expected, string(ref))
}


================================================
FILE: scraper/http.go
================================================
package scraper

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/cornelk/gotokit/app"
	"github.com/cornelk/gotokit/log"
)

var (
	maxRetries = 10
	retryDelay = 1500 * time.Millisecond

	errExhaustedRetries = errors.New("exhausted retries")
)

func (s *Scraper) downloadURL(ctx context.Context, u *url.URL) (*http.Response, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
	if err != nil {
		return nil, fmt.Errorf("creating HTTP request: %w", err)
	}

	req.Header.Set("User-Agent", s.config.UserAgent)
	if s.auth != "" {
		req.Header.Set("Authorization", s.auth)
	}

	for key, values := range s.config.Header {
		for _, value := range values {
			req.Header.Set(key, value)
		}
	}

	resp, err := s.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("executing HTTP request: %w", err)
	}

	return resp, nil
}

func (s *Scraper) downloadURLWithRetries(ctx context.Context, u *url.URL) ([]byte, *url.URL, error) {
	var err error
	var resp *http.Response

	for retries := range maxRetries + 2 {
		if retries == maxRetries+1 {
			return nil, nil, fmt.Errorf("%w for URL %s", errExhaustedRetries, u)
		}

		resp, err = s.downloadURL(ctx, u)
		if err != nil {
			return nil, nil, err
		}

		if resp.StatusCode == http.StatusTooManyRequests {
			s.logger.Warn("Too Many Requests. Retrying again",
				log.Int("num", retries+1),
				log.Int("max", maxRetries),
				log.String("url", u.String()))

			// Wait a bit and try again using exponential backoff on each retry
			if err := app.Sleep(ctx, (time.Duration(retries)+1)*retryDelay); err != nil {
				return nil, nil, fmt.Errorf("sleeping between retries: %w", err)
			}
			continue
		}
		break
	}

	defer func() {
		if err := resp.Body.Close(); err != nil {
			s.logger.Error("Closing HTTP Request body failed",
				log.String("url", u.String()),
				log.Err(err))
		}
	}()

	if resp.StatusCode != http.StatusOK {
		return nil, nil, fmt.Errorf("unexpected HTTP request status code %d", resp.StatusCode)
	}

	buf := &bytes.Buffer{}
	if _, err := io.Copy(buf, resp.Body); err != nil {
		return nil, nil, fmt.Errorf("reading HTTP request body: %w", err)
	}
	return buf.Bytes(), resp.Request.URL, nil
}

// Headers converts a slice of strings to a http.Header.
func Headers(headers []string) http.Header {
	h := http.Header{}
	for _, header := range headers {
		sl := strings.SplitN(header, ":", 2)
		if len(sl) == 2 {
			h.Set(sl[0], sl[1])
		}
	}
	return h
}


================================================
FILE: scraper/http_test.go
================================================
package scraper

import (
	"context"
	"fmt"
	"net/http"
	"net/http/httptest"
	"net/url"
	"testing"
	"time"

	"github.com/cornelk/gotokit/log"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestHeaders(t *testing.T) {
	headers := Headers([]string{"a:b", "c:d:e"})
	assert.Equal(t, "b", headers.Get("a"))
	assert.Equal(t, "d:e", headers.Get("c"))
}

func TestDownloadURLWithRetries(t *testing.T) {
	ctx := context.Background()
	expected := "ok"

	var retry int
	svr := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		if retry < maxRetries {
			retry++
			w.WriteHeader(http.StatusTooManyRequests)
			return
		}
		_, err := fmt.Fprint(w, expected)
		assert.NoError(t, err)
	}))
	defer svr.Close()

	ur, err := url.Parse(svr.URL)
	require.NoError(t, err)

	maxRetries = 2
	retryDelay = time.Millisecond

	var cfg Config
	logger := log.NewTestLogger(t)
	s, err := New(logger, cfg)
	require.NoError(t, err)

	// download works after 2 retries
	b, urActual, err := s.downloadURLWithRetries(ctx, ur)
	require.NoError(t, err)
	require.NotNil(t, urActual)
	assert.Equal(t, svr.URL, urActual.String())
	assert.Equal(t, expected, string(b))
	assert.Equal(t, retry, maxRetries)

	// download fails after 3 retries
	retry = -100
	_, _, err = s.downloadURLWithRetries(ctx, ur)
	assert.ErrorIs(t, err, errExhaustedRetries)
}


================================================
FILE: scraper/images.go
================================================
package scraper

import (
	"bytes"
	"fmt"
	"image"
	"image/jpeg"
	"image/png"
	"net/url"

	"github.com/cornelk/gotokit/log"
	"github.com/h2non/filetype"
	"github.com/h2non/filetype/matchers"
	"github.com/h2non/filetype/types"
)

func (s *Scraper) checkImageForRecode(url *url.URL, data []byte) []byte {
	if s.config.ImageQuality == 0 {
		return data
	}

	kind, err := filetype.Match(data)
	if err != nil || kind == types.Unknown {
		return data
	}

	s.logger.Debug("File type detected",
		log.String("type", kind.MIME.Type),
		log.String("sub_type", kind.MIME.Subtype))

	if kind.MIME.Type == matchers.TypeJpeg.MIME.Type && kind.MIME.Subtype == matchers.TypeJpeg.MIME.Subtype {
		if recoded := s.recodeJPEG(url, data); recoded != nil {
			return recoded
		}
		return data
	}

	if kind.MIME.Type == matchers.TypePng.MIME.Type && kind.MIME.Subtype == matchers.TypePng.MIME.Subtype {
		if recoded := s.recodePNG(url, data); recoded != nil {
			return recoded
		}
		return data
	}

	return data
}

// encodeJPEG encodes a new JPG based on the given quality setting.
func (s *Scraper) encodeJPEG(img image.Image) []byte {
	o := &jpeg.Options{
		Quality: int(s.config.ImageQuality),
	}

	outBuf := &bytes.Buffer{}
	if err := jpeg.Encode(outBuf, img, o); err != nil {
		return nil
	}
	return outBuf.Bytes()
}

// recodeJPEG recodes the image and returns it if it is smaller than before.
func (s *Scraper) recodeJPEG(url fmt.Stringer, data []byte) []byte {
	inBuf := bytes.NewBuffer(data)
	img, err := jpeg.Decode(inBuf)
	if err != nil {
		return nil
	}

	encoded := s.encodeJPEG(img)
	if encoded == nil || len(encoded) > len(data) { // only use the new file if it is smaller
		return nil
	}

	s.logger.Debug("Recoded JPEG",
		log.String("url", url.String()),
		log.Int("size_original", len(data)),
		log.Int("size_recoded", len(encoded)))
	return encoded
}

// recodePNG recodes the image and returns it if it is smaller than before.
func (s *Scraper) recodePNG(url fmt.Stringer, data []byte) []byte {
	inBuf := bytes.NewBuffer(data)
	img, err := png.Decode(inBuf)
	if err != nil {
		return nil
	}

	encoded := s.encodeJPEG(img)
	if encoded == nil || len(encoded) > len(data) { // only use the new file if it is smaller
		return nil
	}

	s.logger.Debug("Recoded PNG",
		log.String("url", url.String()),
		log.Int("size_original", len(data)),
		log.Int("size_recoded", len(encoded)))
	return encoded
}


================================================
FILE: scraper/scraper.go
================================================
package scraper

import (
	"bytes"
	"context"
	"encoding/base64"
	"errors"
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"regexp"
	"time"

	"github.com/cornelk/goscrape/htmlindex"
	"github.com/cornelk/gotokit/httpclient"
	"github.com/cornelk/gotokit/log"
	"github.com/cornelk/gotokit/set"
	"github.com/h2non/filetype"
	"github.com/h2non/filetype/types"
	"golang.org/x/net/html"
)

// Config contains the scraper configuration.
type Config struct {
	URL      string
	Includes []string
	Excludes []string

	ImageQuality uint // image quality from 0 to 100%, 0 to disable reencoding
	MaxDepth     uint // download depth, 0 for unlimited
	Timeout      uint // time limit in seconds to process each http request

	OutputDirectory string
	Username        string
	Password        string

	Cookies   []Cookie
	Header    http.Header
	Proxy     string
	UserAgent string
}

type (
	httpDownloader     func(ctx context.Context, u *url.URL) ([]byte, *url.URL, error)
	dirCreator         func(path string) error
	fileExistenceCheck func(filePath string) bool
	fileWriter         func(filePath string, data []byte) error
)

// Scraper contains all scraping data.
type Scraper struct {
	config  Config
	cookies *cookiejar.Jar
	logger  *log.Logger
	URL     *url.URL // contains the main URL to parse, will be modified in case of a redirect

	auth   string
	client *http.Client

	includes []*regexp.Regexp
	excludes []*regexp.Regexp

	// key is the URL of page or asset
	processed set.Set[string]

	imagesQueue       []*url.URL
	webPageQueue      []*url.URL
	webPageQueueDepth map[string]uint

	dirCreator         dirCreator
	fileExistenceCheck fileExistenceCheck
	fileWriter         fileWriter
	httpDownloader     httpDownloader
}

// New creates a new Scraper instance.
// nolint: funlen
func New(logger *log.Logger, cfg Config) (*Scraper, error) {
	var errs []error

	u, err := url.Parse(cfg.URL)
	if err != nil {
		errs = append(errs, err)
	}
	u.Fragment = ""

	includes, err := compileRegexps(cfg.Includes)
	if err != nil {
		errs = append(errs, err)
	}

	excludes, err := compileRegexps(cfg.Excludes)
	if err != nil {
		errs = append(errs, err)
	}

	if errs != nil {
		return nil, errors.Join(errs...)
	}

	if u.Scheme == "" {
		u.Scheme = "http" // if no URL scheme was given default to http
	}

	cookies, err := createCookieJar(u, cfg.Cookies)
	if err != nil {
		return nil, err
	}

	// Create HTTP transport with proxy configuration
	transport, err := httpclient.ProxyTransportFromConfig(cfg.Proxy)
	if err != nil {
		return nil, fmt.Errorf("creating proxy transport: %w", err)
	}

	client := &http.Client{
		Jar:       cookies,
		Timeout:   time.Duration(cfg.Timeout) * time.Second,
		Transport: transport,
	}

	s := &Scraper{
		config:  cfg,
		cookies: cookies,
		logger:  logger,
		URL:     u,

		client: client,

		includes: includes,
		excludes: excludes,

		processed: set.New[string](),

		webPageQueueDepth: map[string]uint{},
	}

	s.dirCreator = s.createDownloadPath
	s.fileExistenceCheck = s.fileExists
	s.fileWriter = s.writeFile
	s.httpDownloader = s.downloadURLWithRetries

	if s.config.Username != "" {
		s.auth = "Basic " + base64.StdEncoding.EncodeToString([]byte(s.config.Username+":"+s.config.Password))
	}

	return s, nil
}

// Start starts the scraping.
func (s *Scraper) Start(ctx context.Context) error {
	if err := s.dirCreator(s.config.OutputDirectory); err != nil {
		return err
	}

	if !s.shouldURLBeDownloaded(s.URL, 0, false) {
		return errors.New("start page is excluded from downloading")
	}

	if err := s.processURL(ctx, s.URL, 0); err != nil {
		return err
	}

	for len(s.webPageQueue) > 0 {
		ur := s.webPageQueue[0]
		s.webPageQueue = s.webPageQueue[1:]
		currentDepth := s.webPageQueueDepth[ur.String()]
		if err := s.processURL(ctx, ur, currentDepth+1); err != nil && errors.Is(err, context.Canceled) {
			return err
		}
	}

	return nil
}

func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint) error {
	s.logger.Info("Downloading webpage", log.String("url", u.String()))
	data, respURL, err := s.httpDownloader(ctx, u)
	if err != nil {
		s.logger.Error("Processing HTTP Request failed",
			log.String("url", u.String()),
			log.Err(err))
		return err
	}

	fileExtension := ""
	kind, err := filetype.Match(data)
	if err == nil && kind != types.Unknown {
		fileExtension = kind.Extension
	}

	if currentDepth == 0 {
		u = respURL
		// use the URL that the website returned as new base url for the
		// scrape, in case of a redirect it changed
		s.URL = u
	}

	buf := bytes.NewBuffer(data)
	doc, err := html.Parse(buf)
	if err != nil {
		s.logger.Error("Parsing HTML failed",
			log.String("url", u.String()),
			log.Err(err))
		return fmt.Errorf("parsing HTML: %w", err)
	}

	index := htmlindex.New(s.logger)
	index.Index(u, doc)

	s.storeDownload(u, data, doc, index, fileExtension)

	if err := s.downloadReferences(ctx, index); err != nil {
		return err
	}

	// check first and download afterward to not hit max depth limit for
	// start page links because of recursive linking
	// a hrefs
	references, err := index.URLs(htmlindex.ATag)
	if err != nil {
		s.logger.Error("Parsing URL failed", log.Err(err))
	}

	for _, ur := range references {
		ur.Fragment = ""

		if s.shouldURLBeDownloaded(ur, currentDepth, false) {
			s.webPageQueue = append(s.webPageQueue, ur)
			s.webPageQueueDepth[ur.String()] = currentDepth
		}
	}

	return nil
}

// storeDownload writes the download to a file, if a known binary file is detected,
// processing of the file as page to look for links is skipped.
func (s *Scraper) storeDownload(u *url.URL, data []byte, doc *html.Node,
	index *htmlindex.Index, fileExtension string) {

	// We need to distinguish between HTML pages and binary files (images, PDFs, etc.)
	// because they need different file path handling:
	// - HTML pages: add .html extension, handle directory indexes like /about -> /about.html
	// - Binary files: keep original path, so /photo.jpg stays /photo.jpg, not /photo.jpg.html
	// This prevents breaking binary downloads that were working before.
	isAPage := false
	if fileExtension == "" {
		fixed, hasChanges, err := s.fixURLReferences(u, doc, index)
		if err != nil {
			s.logger.Error("Fixing file references failed",
				log.String("url", u.String()),
				log.Err(err))
			return
		}

		if hasChanges {
			data = fixed
		}
		// Only HTML content gets processed as a "page" - binary files stay as-is
		isAPage = true
	}

	filePath := s.getFilePath(u, isAPage)
	// always update html files, content might have changed
	if err := s.fileWriter(filePath, data); err != nil {
		s.logger.Error("Writing to file failed",
			log.String("URL", u.String()),
			log.String("file", filePath),
			log.Err(err))
	}
}

// compileRegexps compiles the given regex strings to regular expressions
// to be used in the include and exclude filters.
func compileRegexps(regexps []string) ([]*regexp.Regexp, error) {
	var errs []error
	var compiled []*regexp.Regexp

	for _, exp := range regexps {
		re, err := regexp.Compile(exp)
		if err == nil {
			compiled = append(compiled, re)
		} else {
			errs = append(errs, err)
		}
	}

	if len(errs) > 0 {
		return nil, errors.Join(errs...)
	}
	return compiled, nil
}


================================================
FILE: scraper/scraper_test.go
================================================
package scraper

import (
	"context"
	"fmt"
	"net/url"
	"testing"

	"github.com/cornelk/gotokit/log"
	"github.com/cornelk/gotokit/set"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func newTestScraper(t *testing.T, startURL string, urls map[string][]byte) *Scraper {
	t.Helper()

	logger := log.NewTestLogger(t)
	cfg := Config{
		URL: startURL,
	}
	scraper, err := New(logger, cfg)
	require.NoError(t, err)
	require.NotNil(t, scraper)

	scraper.dirCreator = func(_ string) error {
		return nil
	}
	scraper.fileWriter = func(_ string, _ []byte) error {
		return nil
	}
	scraper.fileExistenceCheck = func(_ string) bool {
		return false
	}
	scraper.httpDownloader = func(_ context.Context, url *url.URL) ([]byte, *url.URL, error) {
		ur := url.String()
		b, ok := urls[ur]
		if ok {
			return b, url, nil
		}
		return nil, nil, fmt.Errorf("url '%s' not found in test data", ur)
	}

	return scraper
}

func TestScraperLinks(t *testing.T) {
	indexPage := []byte(`
<html>
<head>
<link href=' https://example.org/style.css#fragment' rel='stylesheet' type='text/css'>
</head>
<body>
<a href="https://example.org/page2">Example</a>
</body>
</html>
`)

	page2 := []byte(`
<html>
<body>

<!--link to index with fragment-->
<a href="/#fragment">a</a>
<!--link to page with fragment-->
<a href="/sub/#fragment">a</a>

</body>
</html>
`)

	css := []byte(``)

	startURL := "https://example.org/#fragment" // start page with fragment
	urls := map[string][]byte{
		"https://example.org/":          indexPage,
		"https://example.org/page2":     page2,
		"https://example.org/sub/":      indexPage,
		"https://example.org/style.css": css,
	}

	scraper := newTestScraper(t, startURL, urls)
	require.NotNil(t, scraper)

	ctx := context.Background()
	err := scraper.Start(ctx)
	require.NoError(t, err)

	expectedProcessed := set.NewFromSlice([]string{
		"/",
		"/page2",
		"/sub",
		"/style.css",
	})
	assert.Equal(t, expectedProcessed, scraper.processed)
}

func TestScraperAttributes(t *testing.T) {
	indexPage := []byte(`
<html>
<head>
</head>

<body background="bg.gif">

<!--embedded image-->
<img src='data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs%3D=' />

</body>
</html>
`)
	empty := []byte(``)

	startURL := "https://example.org/"
	urls := map[string][]byte{
		"https://example.org/":       indexPage,
		"https://example.org/bg.gif": empty,
	}

	scraper := newTestScraper(t, startURL, urls)
	require.NotNil(t, scraper)

	ctx := context.Background()
	err := scraper.Start(ctx)
	require.NoError(t, err)

	expectedProcessed := set.NewFromSlice([]string{
		"/",
		"/bg.gif",
	})
	assert.Equal(t, expectedProcessed, scraper.processed)
}

func TestScraperInternalCss(t *testing.T) {
	indexPage := []byte(`
<html>
<head>
<style>
h1 {
  background-image: url('https://example.org/background.jpg');
}
h2 {
  background-image: url('/img/bg.jpg');
}
h3 {
  background-image: url(bg3.jpg);
}
</style>
</head>
<body>
</body>
</html>
`)
	empty := []byte(``)

	domain := "example.org"
	file1Reference := "background.jpg"
	file2Reference := "img/bg.jpg"
	file3Reference := "bg3.jpg"
	fullURL := "https://" + domain

	urls := map[string][]byte{
		fullURL + "/":                  indexPage,
		fullURL + "/" + file1Reference: empty,
		fullURL + "/" + file2Reference: empty,
		fullURL + "/" + file3Reference: empty,
	}

	scraper := newTestScraper(t, fullURL+"/", urls)
	require.NotNil(t, scraper)

	files := map[string][]byte{}
	scraper.fileWriter = func(filePath string, data []byte) error {
		files[filePath] = data
		return nil
	}

	ctx := context.Background()
	err := scraper.Start(ctx)
	require.NoError(t, err)

	expectedProcessed := set.NewFromSlice([]string{
		"/",
		"/" + file1Reference,
		"/" + file2Reference,
		"/" + file3Reference,
	})
	require.Equal(t, expectedProcessed, scraper.processed)

	ref := domain + "/index.html"
	content := string(files[ref])
	assert.Contains(t, content, "url('"+file1Reference+"')")
	assert.Contains(t, content, "url('"+file2Reference+"')")
	assert.Contains(t, content, "url("+file3Reference+")")
}


================================================
FILE: scraper/server.go
================================================
package scraper

import (
	"context"
	"fmt"
	"mime"
	"net/http"

	"github.com/cornelk/gotokit/log"
)

// set more mime types in the browser, this for example fixes .asp files not being
// downloaded but handled as html.
var mimeTypes = map[string]string{
	".asp": "text/html; charset=utf-8",
}

// ServeDirectory serves a directory on a given port as a web server.
func ServeDirectory(ctx context.Context, path string, port int16, logger *log.Logger) error {
	fs := http.FileServer(http.Dir(path))
	mux := http.NewServeMux()
	mux.Handle("/", fs) // server root by file system

	// update mime types
	for ext, mt := range mimeTypes {
		if err := mime.AddExtensionType(ext, mt); err != nil {
			return fmt.Errorf("adding mime type '%s': %w", ext, err)
		}
	}

	fullAddr := fmt.Sprintf("http://127.0.0.1:%d", port)
	logger.Info("Serving directory...",
		log.String("path", path),
		log.String("address", fullAddr))

	server := &http.Server{
		Addr:    fmt.Sprintf(":%d", port),
		Handler: mux,
	}

	serverErr := make(chan error, 1)
	go func() {
		serverErr <- server.ListenAndServe()
	}()

	select {
	case <-ctx.Done():
		//nolint: contextcheck
		if err := server.Shutdown(context.Background()); err != nil {
			return fmt.Errorf("shutting down webserver: %w", err)
		}
		return nil

	case err := <-serverErr:
		return fmt.Errorf("starting webserver: %w", err)
	}
}


================================================
FILE: scraper/url.go
================================================
package scraper

import (
	"net/url"
	"path"
	"path/filepath"
	"strings"
)

func resolveURL(base *url.URL, reference, mainPageHost string, isHyperlink bool, relativeToRoot string) string {
	ur, err := url.Parse(reference)
	if err != nil {
		return ""
	}

	var resolvedURL *url.URL
	if ur.Host != "" && ur.Host != mainPageHost {
		if isHyperlink { // do not change links to external websites
			return reference
		}

		resolvedURL = base.ResolveReference(ur)
		resolvedURL.Path = filepath.Join("_"+ur.Host, resolvedURL.Path)
	} else {
		if isHyperlink {
			ur.Path = getPageFilePath(ur)
			resolvedURL = base.ResolveReference(ur)
		} else {
			resolvedURL = base.ResolveReference(ur)
		}
	}

	if resolvedURL.Host == mainPageHost {
		resolvedURL.Path = urlRelativeToOther(resolvedURL, base)
		relativeToRoot = ""
	}

	resolvedURL.Host = ""   // remove host
	resolvedURL.Scheme = "" // remove http/https
	resolved := resolvedURL.String()

	if resolved == "" {
		resolved = "/" // website root
	} else {
		if resolved[0] == '/' && len(relativeToRoot) > 0 {
			resolved = relativeToRoot + resolved[1:]
		} else {
			resolved = relativeToRoot + resolved
		}
	}

	if isHyperlink {
		if resolved[len(resolved)-1] == '/' {
			resolved += PageDirIndex // link dir index to index.html
		} else {
			l := strings.LastIndexByte(resolved, '/')
			if l != -1 && l < len(resolved) && resolved[l+1] == '#' {
				resolved = resolved[:l+1] + PageDirIndex + resolved[l+1:] // link fragment correct
			}
		}
	}

	resolved = strings.TrimPrefix(resolved, "/")
	return resolved
}

func urlRelativeToRoot(url *url.URL) string {
	var rel strings.Builder
	splits := strings.Split(url.Path, "/")

	for i := range splits {
		if (len(splits[i]) > 0) && (i < len(splits)-1) {
			rel.WriteString("../")
		}
	}

	return rel.String()
}

func urlRelativeToOther(src, base *url.URL) string {
	srcSplits := strings.Split(src.Path, "/")
	baseSplits := strings.Split(getPageFilePath(base), "/")

	for len(srcSplits) > 0 && len(baseSplits) > 0 {
		if len(srcSplits[0]) == 0 {
			srcSplits = srcSplits[1:]
			continue
		}
		if len(baseSplits[0]) == 0 {
			baseSplits = baseSplits[1:]
			continue
		}

		if srcSplits[0] == baseSplits[0] {
			srcSplits = srcSplits[1:]
			baseSplits = baseSplits[1:]
		} else {
			break
		}
	}

	var upLevels strings.Builder

	for i, split := range baseSplits {
		if split == "" {
			continue
		}
		// Page filename is not a level.
		if i == len(baseSplits)-1 {
			break
		}
		upLevels.WriteString("../")
	}

	return upLevels.String() + path.Join(srcSplits...)
}


================================================
FILE: scraper/url_test.go
================================================
package scraper

import (
	"net/url"
	"testing"

	"github.com/stretchr/testify/assert"
)

func TestResolveURL(t *testing.T) {
	type filePathFixture struct {
		BaseURL        url.URL
		Reference      string
		IsHyperlink    bool
		RelativeToRoot string
		Resolved       string
	}

	pathlessURL := url.URL{
		Scheme: "https",
		Host:   "petpic.xyz",
		Path:   "",
	}

	URL := url.URL{
		Scheme: "https",
		Host:   "petpic.xyz",
		Path:   "/earth/",
	}

	var fixtures = []filePathFixture{
		{pathlessURL, "", true, "", "index.html"},
		{pathlessURL, "#contents", true, "", "#contents"},
		{URL, "brasil/index.html", true, "", "brasil/index.html"},
		{URL, "brasil/rio/index.html", true, "", "brasil/rio/index.html"},
		{URL, "../argentina/cat.jpg", false, "", "../argentina/cat.jpg"},
	}

	for _, fix := range fixtures {
		resolved := resolveURL(&fix.BaseURL, fix.Reference, URL.Host, fix.IsHyperlink, fix.RelativeToRoot)
		assert.Equal(t, fix.Resolved, resolved)
	}
}

func Test_urlRelativeToOther(t *testing.T) {
	type filePathFixture struct {
		SrcURL          url.URL
		BaseURL         url.URL
		ExpectedSrcPath string
	}

	var fixtures = []filePathFixture{
		{url.URL{Path: "/earth/brasil/rio/cat.jpg"}, url.URL{Path: "/earth/brasil/rio/"}, "cat.jpg"},
		{url.URL{Path: "/earth/brasil/rio/cat.jpg"}, url.URL{Path: "/earth/"}, "brasil/rio/cat.jpg"},
		{url.URL{Path: "/earth/cat.jpg"}, url.URL{Path: "/earth/brasil/rio/"}, "../../cat.jpg"},
		{url.URL{Path: "/earth/argentina/cat.jpg"}, url.URL{Path: "/earth/brasil/rio/"}, "../../argentina/cat.jpg"},
		{url.URL{Path: "/earth/brasil/rio/cat.jpg"}, url.URL{Path: "/mars/dogtown/"}, "../../earth/brasil/rio/cat.jpg"},
		{url.URL{Path: "///earth//////cat.jpg"}, url.URL{Path: "///earth/brasil//rio////////"}, "../../cat.jpg"},
	}

	for _, fix := range fixtures {
		relativeURL := urlRelativeToOther(&fix.SrcURL, &fix.BaseURL)
		assert.Equal(t, fix.ExpectedSrcPath, relativeURL)
	}
}

func Test_urlRelativeToRoot(t *testing.T) {
	type urlFixture struct {
		SrcURL   url.URL
		Expected string
	}

	var fixtures = []urlFixture{
		{url.URL{Path: "/earth/brasil/rio/cat.jpg"}, "../../../"},
		{url.URL{Path: "cat.jpg"}, ""},
		{url.URL{Path: "/earth/argentina"}, "../"},
		{url.URL{Path: "///earth//////cat.jpg"}, "../"},
	}

	for _, fix := range fixtures {
		relativeURL := urlRelativeToRoot(&fix.SrcURL)
		assert.Equal(t, fix.Expected, relativeURL)
	}
}