[
  {
    "path": ".codecov.yml",
    "content": "coverage:\n  status:\n    patch: off\n    project:\n      default:\n        target: 35%\n        threshold: 5%\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.md",
    "content": "---\nname: Bug report\nabout: Create a report to help us improve\ntitle: ''\nlabels: bug\nassignees: ''\n\n---\n\n**Describe the bug**\nA clear and concise description of what the bug is.\n\n**To Reproduce**\nSteps to reproduce the behavior:\n\n**Expected behavior**\nA clear and concise description of what you expected to happen.\n\n**Screenshots**\nIf applicable, add screenshots to help explain your problem.\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.md",
    "content": "---\nname: Feature request\nabout: Suggest an idea for this project\ntitle: ''\nlabels: enhancement\nassignees: ''\n\n---\n\n**Is your feature request related to a problem? Please describe.**\nA clear and concise description of what the problem is. Ex. I'm always frustrated when [...]\n\n**Describe the solution you'd like**\nA clear and concise description of what you want to happen.\n\n**Describe alternatives you've considered**\nA clear and concise description of any alternative solutions or features you've considered.\n\n**Additional context**\nAdd any other context or screenshots about the feature request here.\n"
  },
  {
    "path": ".github/workflows/codecov.yaml",
    "content": "name: codecov\n\non:\n  push:\n    branches:\n      - main\n  pull_request:\n\njobs:\n  codecov:\n    timeout-minutes: 15\n\n    name: Coverage\n    runs-on: ubuntu-latest\n    steps:\n      - name: Set up Go 1.x\n        uses: actions/setup-go@v5\n        with:\n          go-version: \"1.24\"\n        id: go\n\n      - name: Check out code into the Go module directory\n        uses: actions/checkout@v4\n\n      - name: Get dependencies\n        run: go mod download\n\n      - name: Run tests with coverage\n        run: make test-coverage\n\n      - name: Upload coverage to Codecov\n        uses: codecov/codecov-action@v4\n        with:\n          token: ${{ secrets.CODECOV_TOKEN }}\n          file: ./.testCoverage\n"
  },
  {
    "path": ".github/workflows/go.yaml",
    "content": "name: CI\n\non:\n  push:\n    branches:\n      - main\n  pull_request:\n    types:\n      - opened\n      - reopened\n      - synchronize\n      - ready_for_review\n\njobs:\n  build:\n    if: ${{ github.event_name == 'push' || !github.event.pull_request.draft || !contains(github.event.commits[0].message, '[skip ci]') }}\n    timeout-minutes: 15\n\n    name: Build\n    runs-on: ubuntu-latest\n    strategy:\n      matrix:\n        go: [ \"1.24\" ]\n\n    steps:\n      - name: Set up Go 1.x\n        uses: actions/setup-go@v5\n        with:\n          go-version: ${{ matrix.go }}\n        id: go\n\n      - name: Check out code into the Go module directory\n        uses: actions/checkout@v4\n\n      - name: Install linters\n        run: make install-linters\n\n      - name: Get dependencies\n        run: go mod download\n\n      - name: Run tests\n        run: make test\n\n      - name: Run linter\n        run: make lint\n"
  },
  {
    "path": ".github/workflows/release.yml",
    "content": "name: release\non:\n  push:\n    tags:\n      - 'v*'\njobs:\n  goreleaser:\n    runs-on: ubuntu-latest\n    steps:\n      -\n        name: Checkout\n        uses: actions/checkout@v4\n        with:\n          fetch-depth: 0\n      -\n        name: Set up Go\n        uses: actions/setup-go@v5\n        with:\n          go-version: '1.24'\n          check-latest: true\n          cache: true\n      -\n        name: Docker login\n        uses: docker/login-action@v3\n        with:\n          registry: 'ghcr.io'\n          username: ${{ github.actor }}\n          password: ${{ secrets.GITHUB_TOKEN }}\n      -\n        name: Run GoReleaser\n        uses: goreleaser/goreleaser-action@v6\n        with:\n          version: latest\n          args: release --clean\n        env:\n          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n"
  },
  {
    "path": ".gitignore",
    "content": "*.db\n*.dmp\n*.exe\n*.iml\n*.local\n*.out\n*.prof\n*.test\n.DS_Store\n.fleet\n.idea\n.vscode\n/*.log\n\n.testCoverage\ndist/\ngoscrape\n"
  },
  {
    "path": ".golangci.yml",
    "content": "version: \"2\"\nrun:\n  modules-download-mode: readonly\nlinters:\n  enable:\n    - asasalint # Check for pass []any as any in variadic func(...any)\n    - asciicheck # Checks that code identifiers do not have non-ASCII symbols\n    - bidichk # Checks for dangerous unicode character sequences\n    - bodyclose # Checks whether HTTP response body is closed successfully\n    - containedctx # Detects struct contained context.Context field\n    - contextcheck # Check whether the function uses a non-inherited context\n    - copyloopvar # Detects places where loop variables are copied\n    - cyclop # Checks function and package cyclomatic complexity\n    - decorder # Check declaration order of types, constants, variables and functions\n    - dogsled # Checks assignments with too many blank identifiers\n    - durationcheck # Check for two durations multiplied together\n    - embeddedstructfieldcheck # Checks embedded struct field positioning and formatting\n    - err113 # Check errors handling expressions\n    - errcheck # Checks for unchecked errors in Go code\n    - errname # Checks that sentinel errors are prefixed with Err\n    - errorlint # Find code that will cause problems with error wrapping\n    - funlen # Checks for long functions\n    - gocheckcompilerdirectives # Checks that go compiler directives are valid\n    - gocognit # Computes and checks the cognitive complexity of functions\n    - goconst # Finds repeated strings that could be replaced by a constant\n    - gocritic # Provides diagnostics that check for bugs, performance and style issues\n    - gocyclo # Computes and checks the cyclomatic complexity of functions\n    - godot # Check if comments end in a period\n    - govet # Examines Go source code and reports suspicious constructs\n    - grouper # Analyze expression groups\n    - iface # Detect the incorrect use of interfaces\n    - ineffassign # Detects when assignments to existing variables are not used\n    - intrange # Find places where for loops could use an integer range\n    - ireturn # Accept Interfaces, Return Concrete Types\n    - maintidx # Measures the maintainability index of each function\n    - makezero # Find slice declarations with non-zero initial length\n    - mirror # Reports wrong mirror patterns of bytes/strings usage\n    - misspell # Finds commonly misspelled English words\n    - modernize # Suggest simplifications using modern Go features\n    - nakedret # Finds naked returns in functions longer than a maximum size\n    - nestif # Reports deeply nested if statements\n    - nilerr # Finds code that returns nil even if it checks that error is not nil\n    - nilnil # Checks that there is no simultaneous return of nil error and invalid value\n    - noctx # Detects missing usage of context.Context\n    - nosprintfhostport # Checks for misuse of Sprintf to construct a host with port\n    - perfsprint # Checks that fmt.Sprintf can be replaced with a faster alternative\n    - prealloc # Find slice declarations that could be pre-allocated\n    - predeclared # Find code that shadows one of Go's predeclared identifiers\n    - reassign # Checks that package variables are not reassigned\n    - revive # Fast, configurable, extensible linter for Go\n    - staticcheck # Set of rules from staticcheck\n    - testifylint # Checks usage of github.com/stretchr/testify\n    - thelper # Detects test helpers which do not call t.Helper()\n    - tparallel # Detects inappropriate usage of t.Parallel()\n    - unconvert # Remove unnecessary type conversions\n    - 
unparam # Reports unused function parameters\n    - unused # Checks Go code for unused constants, variables, functions and types\n    - usestdlibvars # Detect the possibility to use variables/constants from stdlib\n    - wastedassign # Finds wasted assignment statements\n    - whitespace # Checks for unnecessary newlines\n    - wrapcheck # Checks that errors returned from external packages are wrapped\n  settings:\n    cyclop:\n      max-complexity: 15\n    whitespace:\n      multi-if: true # Enforces newlines (or comments) after every multi-line if statement\n      multi-func: true # Enforces newlines (or comments) after every multi-line function signature\n  exclusions:\n    rules:\n      - linters:\n          - err113\n        text: do not define dynamic errors\nformatters:\n  enable:\n    - gci # Check if code and import statements are formatted, with additional rules\n    - gofmt # Check if the code is formatted according to 'gofmt' command\n    - goimports # Checks if the code and import statements are formatted according to the 'goimports' command\n"
  },
  {
    "path": ".goreleaser.yaml",
    "content": "version: 2\n\nbefore:\n  hooks:\n    - go mod download\n\nbuilds:\n  - id: goscrape\n    binary: goscrape\n    dir: .\n    env:\n      - CGO_ENABLED=0\n    targets:\n      - go_first_class\n    flags:\n      - -trimpath\n    ldflags:\n      - -s -w -X main.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{ .CommitDate }}\n\ndockers:\n  - id: goscrape\n    image_templates:\n      - \"ghcr.io/cornelk/goscrape:{{ .Version }}\"\n\nuniversal_binaries:\n  - replace: false\n\narchives:\n  - id: goscrape\n    builds: ['goscrape']\n    name_template: >-\n      {{ .ProjectName }}_v{{ .Version }}_\n      {{- if eq .Os \"darwin\" }}macos\n      {{- else }}{{ .Os }}{{ end }}_\n      {{- if eq .Arch \"amd64\" }}64bit\n      {{- else if eq .Arch \"386\" }}32bit\n      {{- else if eq .Arch \"Darwin\" }}macos\n      {{- else }}{{ .Arch }}{{ end }}\n    format_overrides:\n      - goos: windows\n        format: zip\n\nchecksum:\n  name_template: 'checksums.txt'\n\nsnapshot:\n  name_template: \"{{ .Tag }}-snapshot\"\n\nchangelog:\n  disable: true\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "# Changelog for goscrape\n\nAll notable changes to this project will be documented in this file.\n\n## [v0.3.0] - 2025-01-01\n\nAdded:\n\n* body background attribute support\n* fragments at the end of the URL are now filtered\n* support srcset attribute for image downloading\n\nFixed:\n\n* inline css gets processed correctly\n* --include / --exclude parameter list handling\n\n## [v0.2.0] - 2024-06-21\n\nAdded:\n\n* add cookie loading and storing support\n* add docker image build\n* add header parameter support\n* add user agent parameter support\n* support serving directory in local webserver\n\nChanged:\n\n* improved error messages\n* reduced dependencies and binary size\n\nFixed:\n\n* fix asset downloading\n* fix downloading of binary files\n* fix logger output\n* fix logger verbose flag usage\n* fix stopping of program using ctrl+c\n\n## [v0.1.1] - 2023-02-11\n\nAdded:\n\n* binaries for releases\n* http request timeout configuration support\n* proxy configuration support\n\nFixed:\n\n* do not abort scraping on error\n\n\n## [v0.1.0] - 2022-06-26\n\nFirst version of goscrape released.\n"
  },
  {
    "path": "Dockerfile",
    "content": "FROM gcr.io/distroless/static-debian12\n\nCOPY goscrape /\n\nENTRYPOINT [\"./goscrape\"]\n\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2017 Cornel\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "Makefile",
    "content": "GOLANGCI_VERSION = v2.6.0\n\nhelp: ## show help, shown by default if no target is specified\n\t@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = \":.*?## \"}; {printf \"\\033[36m%-30s\\033[0m %s\\n\", $$1, $$2}'\n\nlint: ## run code linters\n\tgolangci-lint run\n\ntest: ## run tests\n\tgo test -timeout 10s -race ./...\n\ntest-coverage: ## run unit tests and create test coverage\n\tgo test -timeout 10s ./... -coverprofile .testCoverage -covermode=atomic -coverpkg=./...\n\ntest-coverage-web: test-coverage ## run unit tests and show test coverage in browser\n\tgo tool cover -func .testCoverage | grep total | awk '{print \"Total coverage: \"$$3}'\n\tgo tool cover -html=.testCoverage\n\ninstall: ## install all binaries\n\tgo install -buildvcs=false .\n\ninstall-linters: ## install all linters\n\tgo install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@${GOLANGCI_VERSION}\n\nrelease-snapshot: ## build release binaries from current git state as snapshot\n\tgoreleaser release --snapshot --clean\n"
  },
  {
    "path": "README.md",
    "content": "# goscrape - create offline browsable copies of websites\n\n[![Build status](https://github.com/cornelk/goscrape/actions/workflows/go.yaml/badge.svg?branch=main)](https://github.com/cornelk/goscrape/actions)\n[![go.dev reference](https://img.shields.io/badge/go.dev-reference-007d9c?logo=go&logoColor=white&style=flat-square)](https://pkg.go.dev/github.com/cornelk/goscrape)\n[![Go Report Card](https://goreportcard.com/badge/github.com/cornelk/goscrape)](https://goreportcard.com/report/github.com/cornelk/goscrape)\n[![codecov](https://codecov.io/gh/cornelk/goscrape/branch/main/graph/badge.svg?token=NS5UY28V3A)](https://codecov.io/gh/cornelk/goscrape)\n\nA web scraper built with Golang. It downloads the content of a website and allows it to be archived and read offline.\n\n## Features\n\nFeatures and advantages over existing tools like wget, httrack, Teleport Pro:\n\n* Free and open source\n* Available for all platforms that Golang supports\n* JPEG and PNG images can be converted down in quality to save disk space\n* Excluded URLS will not be fetched (unlike [wget](https://savannah.gnu.org/bugs/?20808))\n* No incomplete temp files are left on disk\n* Downloaded asset files are skipped in a new scraper run\n* Assets from external domains are downloaded automatically\n* Sane default values\n\n## Limitations\n\n* No GUI version, console only\n\n## Installation\n\nThere are 2 options to install goscrape:\n\n1. Download and unpack a binary release from [Releases](https://github.com/cornelk/goscrape/releases)\n\nor\n\n2. Compile the latest release from source:\n\n```\ngo install github.com/cornelk/goscrape@latest\n```\n\nCompiling the tool from source code needs to have a recent version of [Golang](https://go.dev/) installed.\n\n## Usage\n\nScrape a website by running\n```\ngoscrape http://website.com\n```\n\nTo serve the downloaded website directory in a local run webserver use\n```\ngoscrape --serve website.com\n```\n\n## Options\n\n```\n  --include INCLUDE, -n INCLUDE\n                         only include URLs with PERL Regular Expressions support\n  --exclude EXCLUDE, -x EXCLUDE\n                         exclude URLs with PERL Regular Expressions support\n  --output OUTPUT, -o OUTPUT\n                         output directory to write files to\n  --depth DEPTH, -d DEPTH\n                         download depth, 0 for unlimited [default: 10]\n  --imagequality IMAGEQUALITY, -i IMAGEQUALITY\n                         image quality, 0 to disable reencoding\n  --timeout TIMEOUT, -t TIMEOUT\n                         time limit in seconds for each HTTP request to connect and read the request body\n  --serve SERVE, -s SERVE\n                         serve the website using a webserver\n  --serverport SERVERPORT, -r SERVERPORT\n                         port to use for the webserver [default: 8080]\n  --cookiefile COOKIEFILE, -c COOKIEFILE\n                         file containing the cookie content\n  --savecookiefile SAVECOOKIEFILE\n                         file to save the cookie content\n  --header HEADER, -h HEADER\n                         HTTP header to use for scraping\n  --proxy PROXY, -p PROXY\n                         proxy to use in format scheme://[user:password@]host:port (supports HTTP, HTTPS, SOCKS5 protocols)\n  --user USER, -u USER   user[:password] to use for HTTP authentication\n  --useragent USERAGENT, -a USERAGENT\n                         user agent to use for scraping\n  --verbose, -v          verbose output\n  --help, -h             display this help and exit\n  
--version              display version and exit\n```\n\n## Cookies\n\nCookies can be passed in a file using the `--cookiefile` parameter and a file containing\ncookies in the following format:\n\n```\n[{\"name\":\"user\",\"value\":\"123\"},{\"name\":\"sessioe\",\"value\":\"sid\"}]\n```\n\n## Proxy Configuration\n\nThe `--proxy` flag supports multiple proxy protocols for scraping through different types of proxy servers:\n\n### Supported Proxy Types\n\n- **HTTP proxies**: `http://proxy.example.com:8080`\n- **HTTPS proxies**: `https://proxy.example.com:8080`\n- **SOCKS5 proxies**: `socks5://proxy.example.com:1080`\n\n### Authentication\n\nProxies requiring authentication can include credentials in the URL:\n\n```bash\n# HTTP proxy with authentication\ngoscrape --proxy http://username:password@proxy.example.com:8080 http://website.com\n\n# SOCKS5 proxy with authentication\ngoscrape --proxy socks5://user:pass@proxy.example.com:1080 http://website.com\n```\n"
  },
  {
    "path": "css/css.go",
    "content": "// Package css provides a CSS parser that can process CSS data and call a processor for every found URL.\npackage css\n\nimport (\n\t\"net/url\"\n\t\"regexp\"\n\t\"strings\"\n\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/gorilla/css/scanner\"\n)\n\nvar cssURLRe = regexp.MustCompile(`^url\\(['\"]?(.*?)['\"]?\\)$`)\n\n// Token represents a token and the corresponding string.\ntype Token = scanner.Token\n\ntype urlProcessor func(token *Token, data string, url *url.URL)\n\n// Process the CSS data and call a processor for every found URL.\nfunc Process(logger *log.Logger, url *url.URL, data string, processor urlProcessor) {\n\tcss := scanner.New(data)\n\n\tfor {\n\t\ttoken := css.Next()\n\t\tif token.Type == scanner.TokenEOF || token.Type == scanner.TokenError {\n\t\t\tbreak\n\t\t}\n\t\tif token.Type != scanner.TokenURI {\n\t\t\tcontinue\n\t\t}\n\n\t\tmatch := cssURLRe.FindStringSubmatch(token.Value)\n\t\tif match == nil {\n\t\t\tcontinue\n\t\t}\n\n\t\tsrc := match[1]\n\t\tif strings.HasPrefix(strings.ToLower(src), \"data:\") {\n\t\t\tcontinue // skip embedded data\n\t\t}\n\n\t\tu, err := url.Parse(src)\n\t\tif err != nil {\n\t\t\tlogger.Error(\"Parsing URL failed\",\n\t\t\t\tlog.String(\"url\", src),\n\t\t\t\tlog.Err(err))\n\t\t\tcontinue\n\t\t}\n\n\t\tprocessor(token, src, u)\n\t}\n}\n"
  },
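  {
    "path": "css/css_sketch_test.go",
    "content": "package css_test\n\nimport (\n\t\"net/url\"\n\t\"testing\"\n\n\t\"github.com/cornelk/goscrape/css\"\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n)\n\n// TestProcessSketch is a minimal usage sketch for css.Process, assuming only\n// APIs used elsewhere in this repository: it collects every url() reference\n// of a stylesheet, relying on relative references being resolved against the\n// base URL and data: URIs being skipped.\nfunc TestProcessSketch(t *testing.T) {\n\tbase, err := url.Parse(\"https://example.com/styles/main.css\")\n\trequire.NoError(t, err)\n\n\tvar found []string\n\tprocessor := func(_ *css.Token, _ string, u *url.URL) {\n\t\tfound = append(found, u.String())\n\t}\n\n\tdata := `body { background: url(\"../img/bg.png\") no-repeat; }\n.icon { background: url(data:image/gif;base64,R0lGODl); }`\n\tcss.Process(log.NewTestLogger(t), base, data, processor)\n\n\tassert.Equal(t, []string{\"https://example.com/img/bg.png\"}, found)\n}\n"
  },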
  {
    "path": "go.mod",
    "content": "module github.com/cornelk/goscrape\n\ngo 1.24.0\n\nrequire (\n\tgithub.com/alexflint/go-arg v1.6.0\n\tgithub.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b\n\tgithub.com/gorilla/css v1.0.1\n\tgithub.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb\n\tgithub.com/stretchr/testify v1.11.1\n\tgolang.org/x/net v0.46.0\n)\n\nrequire (\n\tgithub.com/alexflint/go-scalar v1.2.0 // indirect\n\tgithub.com/davecgh/go-spew v1.1.1 // indirect\n\tgithub.com/kr/pretty v0.3.0 // indirect\n\tgithub.com/pmezard/go-difflib v1.0.0 // indirect\n\tgopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect\n\tgopkg.in/yaml.v3 v3.0.1 // indirect\n)\n"
  },
  {
    "path": "go.sum",
    "content": "github.com/alexflint/go-arg v1.6.0 h1:wPP9TwTPO54fUVQl4nZoxbFfKCcy5E6HBCumj1XVRSo=\ngithub.com/alexflint/go-arg v1.6.0/go.mod h1:A7vTJzvjoaSTypg4biM5uYNTkJ27SkNTArtYXnlqVO8=\ngithub.com/alexflint/go-scalar v1.2.0 h1:WR7JPKkeNpnYIOfHRa7ivM21aWAdHD0gEWHCx+WQBRw=\ngithub.com/alexflint/go-scalar v1.2.0/go.mod h1:LoFvNMqS1CPrMVltza4LvnGKhaSpc3oyLEBUZVhhS2o=\ngithub.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b h1:jT7eWHlrlvjVXSV47U4IjdgI75nLhf/327EHYaGLzso=\ngithub.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b/go.mod h1:05rfUdBvcZdkWv0/oZfXevCTUzos6WafYzkPzkSkKbY=\ngithub.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=\ngithub.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=\ngithub.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=\ngithub.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=\ngithub.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=\ngithub.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb h1:GlQyMv2C48qmfPItvAXFoyN341Swxp9JNVeUZxnmbJw=\ngithub.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY=\ngithub.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=\ngithub.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=\ngithub.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=\ngithub.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=\ngithub.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=\ngithub.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=\ngithub.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=\ngithub.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=\ngithub.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=\ngithub.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=\ngithub.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k=\ngithub.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=\ngithub.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=\ngithub.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=\ngithub.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=\ngolang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=\ngolang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=\ngopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=\ngopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=\ngopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=\ngopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=\ngopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=\ngopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=\ngopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=\n"
  },
  {
    "path": "htmlindex/attributes.go",
    "content": "package htmlindex\n\nimport (\n\t\"net/url\"\n\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/cornelk/gotokit/set\"\n\t\"golang.org/x/net/html\"\n)\n\ntype nodeAttributeParserData struct {\n\tlogger    *log.Logger\n\turl       *url.URL\n\tnode      *html.Node\n\tattribute string\n\tvalue     string\n}\n\n// nodeAttributeParser returns the URL values of the attribute of the node and\n// whether the attribute has been processed.\ntype nodeAttributeParser func(data nodeAttributeParserData) ([]string, bool)\n\n// Node describes an HTML tag and its attributes that can contain URLs.\ntype Node struct {\n\tAttributes []string\n\n\tnoChildParsing bool\n\tparser         nodeAttributeParser\n}\n\n// nolint: revive\nconst (\n\tBackgroundAttribute = \"background\"\n\tHrefAttribute       = \"href\"\n\n\tDataSrcAttribute = \"data-src\"\n\tSrcAttribute     = \"src\"\n\n\tDataSrcSetAttribute = \"data-srcset\"\n\tSrcSetAttribute     = \"srcset\"\n)\n\n// nolint: revive\nconst (\n\tATag      = \"a\"\n\tBodyTag   = \"body\"\n\tImgTag    = \"img\"\n\tLinkTag   = \"link\"\n\tScriptTag = \"script\"\n\tStyleTag  = \"style\"\n)\n\n// Nodes describes the HTML tags and their attributes that can contain URL.\nvar Nodes = map[string]Node{\n\tATag: {\n\t\tAttributes: []string{HrefAttribute},\n\t},\n\tBodyTag: {\n\t\tAttributes: []string{BackgroundAttribute},\n\t},\n\tImgTag: {\n\t\tAttributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute},\n\t\tparser:     srcSetValueSplitter,\n\t},\n\tLinkTag: {\n\t\tAttributes: []string{HrefAttribute},\n\t},\n\tScriptTag: {\n\t\tAttributes: []string{SrcAttribute},\n\t},\n\tStyleTag: {\n\t\tnoChildParsing: true,\n\t\tparser:         styleParser,\n\t},\n}\n\n// SrcSetAttributes contains the attributes that contain srcset values.\nvar SrcSetAttributes = set.NewFromSlice([]string{\n\tDataSrcSetAttribute,\n\tSrcSetAttribute,\n})\n"
  },
  {
    "path": "htmlindex/htmlindex.go",
    "content": "// Package htmlindex provides an index for all HTML tags of relevance for scraping.\npackage htmlindex\n\nimport (\n\t\"fmt\"\n\t\"net/url\"\n\t\"slices\"\n\t\"sort\"\n\t\"strings\"\n\n\t\"github.com/cornelk/goscrape/css\"\n\t\"github.com/cornelk/gotokit/log\"\n\t\"golang.org/x/net/html\"\n)\n\n// Index provides an index for all HTML tags of relevance for scraping.\ntype Index struct {\n\tlogger *log.Logger\n\n\t// key is HTML tag, value is a map of all its urls and the HTML nodes for it\n\tdata map[string]map[string][]*html.Node\n}\n\n// New returns a new index.\nfunc New(logger *log.Logger) *Index {\n\treturn &Index{\n\t\tlogger: logger,\n\t\tdata:   make(map[string]map[string][]*html.Node),\n\t}\n}\n\n// Index the given HTML document.\nfunc (idx *Index) Index(baseURL *url.URL, node *html.Node) {\n\tfor child := node.FirstChild; child != nil; child = child.NextSibling {\n\t\tswitch child.Type {\n\t\tcase html.ElementNode:\n\t\t\tidx.indexElementNode(baseURL, node, child)\n\t\tdefault:\n\t\t}\n\t}\n}\n\nfunc (idx *Index) indexElementNode(baseURL *url.URL, node, child *html.Node) {\n\tvar references []string\n\n\tinfo, ok := Nodes[child.Data]\n\tif ok {\n\t\treferences = idx.nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...)\n\t}\n\n\tm, ok := idx.data[child.Data]\n\tif !ok {\n\t\tm = map[string][]*html.Node{}\n\t\tidx.data[child.Data] = m\n\t}\n\n\tfor _, reference := range references {\n\t\tm[reference] = append(m[reference], child)\n\t}\n\n\tif node.FirstChild != nil && !info.noChildParsing {\n\t\tidx.Index(baseURL, child)\n\t}\n}\n\n// URLs returns all URLs of the references found for a specific tag.\nfunc (idx *Index) URLs(tag string) ([]*url.URL, error) {\n\tm, ok := idx.data[tag]\n\tif !ok {\n\t\treturn nil, nil\n\t}\n\n\tdata := make([]string, 0, len(m))\n\tfor key := range m {\n\t\tdata = append(data, key)\n\t}\n\tsort.Strings(data)\n\n\turls := make([]*url.URL, 0, len(m))\n\tfor _, fullURL := range data {\n\t\tu, err := url.Parse(fullURL)\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"parsing URL '%s': %w\", fullURL, err)\n\t\t}\n\t\turls = append(urls, u)\n\t}\n\n\treturn urls, nil\n}\n\n// Nodes returns a map of all URLs and their HTML nodes.\nfunc (idx *Index) Nodes(tag string) map[string][]*html.Node {\n\tm, ok := idx.data[tag]\n\tif ok {\n\t\treturn m\n\t}\n\treturn map[string][]*html.Node{}\n}\n\n// nodeAttributeURLs returns resolved URLs based on the base URL and the HTML node attribute values.\nfunc (idx *Index) nodeAttributeURLs(baseURL *url.URL, node *html.Node,\n\tparser nodeAttributeParser, attributeNames ...string) []string {\n\n\tvar results []string\n\n\tprocessReferences := func(references []string) {\n\t\tfor _, reference := range references {\n\t\t\tur, err := url.Parse(reference)\n\t\t\tif err != nil {\n\t\t\t\tcontinue\n\t\t\t}\n\n\t\t\tur = baseURL.ResolveReference(ur)\n\t\t\tresults = append(results, ur.String())\n\t\t}\n\t}\n\n\tfor _, attr := range node.Attr {\n\t\tif !slices.Contains(attributeNames, attr.Key) {\n\t\t\tcontinue\n\t\t}\n\n\t\tvar references []string\n\t\tvar parserHandled bool\n\n\t\tif parser != nil {\n\t\t\tdata := nodeAttributeParserData{\n\t\t\t\tlogger:    idx.logger,\n\t\t\t\turl:       baseURL,\n\t\t\t\tnode:      node,\n\t\t\t\tattribute: attr.Key,\n\t\t\t\tvalue:     strings.TrimSpace(attr.Val),\n\t\t\t}\n\t\t\treferences, parserHandled = parser(data)\n\t\t}\n\t\tif parser == nil || !parserHandled {\n\t\t\treferences = append(references, 
strings.TrimSpace(attr.Val))\n\t\t}\n\n\t\tprocessReferences(references)\n\t}\n\n\t// special case to support style tag\n\tif len(attributeNames) == 0 && parser != nil {\n\t\tdata := nodeAttributeParserData{\n\t\t\tlogger: idx.logger,\n\t\t\turl:    baseURL,\n\t\t\tnode:   node,\n\t\t}\n\t\treferences, _ := parser(data)\n\t\tprocessReferences(references)\n\t}\n\n\treturn results\n}\n\n// srcSetValueSplitter returns the URL values of the srcset attribute of img nodes.\nfunc srcSetValueSplitter(data nodeAttributeParserData) ([]string, bool) {\n\tif !SrcSetAttributes.Contains(data.attribute) {\n\t\treturn nil, false\n\t}\n\n\t// split the set of responsive images\n\tvalues := strings.Split(data.value, \",\")\n\n\tfor i, value := range values {\n\t\tvalue = strings.TrimSpace(value)\n\t\t// remove the width in pixels after the url\n\t\tvalues[i], _, _ = strings.Cut(value, \" \")\n\t}\n\n\treturn values, true\n}\n\n// styleParser returns the URL values of a CSS style tag.\nfunc styleParser(data nodeAttributeParserData) ([]string, bool) {\n\tif data.node.FirstChild == nil {\n\t\treturn nil, false\n\t}\n\n\tvar urls []string\n\tprocessor := func(_ *css.Token, _ string, url *url.URL) {\n\t\turls = append(urls, url.String())\n\t}\n\n\tcssData := data.node.FirstChild.Data\n\tcss.Process(data.logger, data.url, cssData, processor)\n\n\treturn urls, true\n}\n"
  },
  {
    "path": "htmlindex/htmlindex_test.go",
    "content": "package htmlindex\n\nimport (\n\t\"bytes\"\n\t\"net/url\"\n\t\"testing\"\n\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n\t\"golang.org/x/net/html\"\n)\n\nfunc TestIndex(t *testing.T) {\n\tinput := []byte(`\n<html lang=\"es\">\n<a href=\"https://domain.com/wp-content/uploads/document.pdf\" rel=\"doc\">Guide</a>\n<img src=\"/test.jpg\"/> \n</html>\n`)\n\n\tidx := testSetup(t, input)\n\n\t// check a tag\n\tnodeTag := \"a\"\n\treferences, err := idx.URLs(nodeTag)\n\trequire.NoError(t, err)\n\trequire.Len(t, references, 1)\n\n\ttagURL := \"https://domain.com/wp-content/uploads/document.pdf\"\n\tassert.Equal(t, tagURL, references[0].String())\n\n\turls := idx.Nodes(nodeTag)\n\trequire.Len(t, urls, 1)\n\tnodes, ok := urls[tagURL]\n\trequire.True(t, ok)\n\trequire.Len(t, nodes, 1)\n\tnode := nodes[0]\n\tassert.Equal(t, nodeTag, node.Data)\n\n\t// check img tag\n\tnodeTag = \"img\"\n\treferences, err = idx.URLs(nodeTag)\n\trequire.NoError(t, err)\n\trequire.Len(t, references, 1)\n\n\ttagURL = \"https://domain.com/test.jpg\"\n\tassert.Equal(t, tagURL, references[0].String())\n\n\t// check for not existing tag\n\tnodeTag = \"not-existing\"\n\treferences, err = idx.URLs(nodeTag)\n\trequire.NoError(t, err)\n\trequire.Empty(t, references)\n\turls = idx.Nodes(nodeTag)\n\trequire.Empty(t, urls)\n}\n\nfunc TestIndexImg(t *testing.T) {\n\tinput := []byte(`\n<html lang=\"es\">\n<body background=\"bg.jpg\"></body>\n<img src=\"test.jpg\" srcset=\"test-480w.jpg 480w, test-800w.jpg 800w\"/> \n</body>\n</html>\n`)\n\n\tidx := testSetup(t, input)\n\treferences, err := idx.URLs(ImgTag)\n\trequire.NoError(t, err)\n\trequire.Len(t, references, 3)\n\tassert.Equal(t, \"https://domain.com/test-480w.jpg\", references[0].String())\n\tassert.Equal(t, \"https://domain.com/test-800w.jpg\", references[1].String())\n\tassert.Equal(t, \"https://domain.com/test.jpg\", references[2].String())\n\n\treferences, err = idx.URLs(BodyTag)\n\trequire.NoError(t, err)\n\trequire.Len(t, references, 1)\n\tassert.Equal(t, \"https://domain.com/bg.jpg\", references[0].String())\n}\n\nfunc testSetup(t *testing.T, input []byte) *Index {\n\tt.Helper()\n\n\tbuf := &bytes.Buffer{}\n\t_, err := buf.Write(input)\n\trequire.NoError(t, err)\n\n\tdoc, err := html.Parse(buf)\n\trequire.NoError(t, err)\n\n\tur, err := url.Parse(\"https://domain.com/\")\n\trequire.NoError(t, err)\n\n\tlogger := log.NewTestLogger(t)\n\tidx := New(logger)\n\tidx.Index(ur, doc)\n\n\treturn idx\n}\n"
  },
  {
    "path": "main.go",
    "content": "// Package main provides a command line tool to scrape websites and create an offline browsable version on the disk.\npackage main\n\nimport (\n\t\"context\"\n\t\"encoding/json\"\n\t\"errors\"\n\t\"fmt\"\n\t\"os\"\n\t\"strings\"\n\n\t\"github.com/alexflint/go-arg\"\n\t\"github.com/cornelk/goscrape/scraper\"\n\t\"github.com/cornelk/gotokit/app\"\n\t\"github.com/cornelk/gotokit/buildinfo\"\n\t\"github.com/cornelk/gotokit/env\"\n\t\"github.com/cornelk/gotokit/log\"\n)\n\nvar (\n\tversion = \"dev\"\n\tcommit  = \"\"\n\tdate    = \"\"\n)\n\ntype arguments struct {\n\tInclude []string `arg:\"-n,--include\" help:\"only include URLs with PERL Regular Expressions support\"`\n\tExclude []string `arg:\"-x,--exclude\" help:\"exclude URLs with PERL Regular Expressions support\"`\n\tOutput  string   `arg:\"-o,--output\" help:\"output directory to write files to\"`\n\tURLs    []string `arg:\"positional\"`\n\n\tDepth        int64 `arg:\"-d,--depth\" help:\"download depth, 0 for unlimited\" default:\"10\"`\n\tImageQuality int64 `arg:\"-i,--imagequality\" help:\"image quality, 0 to disable reencoding\"`\n\tTimeout      int64 `arg:\"-t,--timeout\" help:\"time limit in seconds for each HTTP request to connect and read the request body\"`\n\n\tServe      string `arg:\"-s,--serve\" help:\"serve the website using a webserver\"`\n\tServerPort int16  `arg:\"-r,--serverport\" help:\"port to use for the webserver\" default:\"8080\"`\n\n\tCookieFile     string `arg:\"-c,--cookiefile\" help:\"file containing the cookie content\"`\n\tSaveCookieFile string `arg:\"--savecookiefile\" help:\"file to save the cookie content\"`\n\n\tHeaders   []string `arg:\"-h,--header\" help:\"HTTP header to use for scraping\"`\n\tProxy     string   `arg:\"-p,--proxy\" help:\"proxy to use in format scheme://[user:password@]host:port (supports HTTP, HTTPS, SOCKS5 protocols)\"`\n\tUser      string   `arg:\"-u,--user\" help:\"user[:password] to use for HTTP authentication\"`\n\tUserAgent string   `arg:\"-a,--useragent\" help:\"user agent to use for scraping\"`\n\n\tVerbose bool `arg:\"-v,--verbose\" help:\"verbose output\"`\n}\n\nfunc (arguments) Description() string {\n\treturn \"Scrape a website and create an offline browsable version on the disk.\\n\"\n}\n\nfunc (arguments) Version() string {\n\treturn fmt.Sprintf(\"Version: %s\\n\", buildinfo.Version(version, commit, date))\n}\n\nfunc main() {\n\targs, err := readArguments()\n\tif err != nil {\n\t\tfmt.Printf(\"Reading arguments failed: %s\\n\", err)\n\t\tos.Exit(1)\n\t}\n\n\tctx := app.Context()\n\n\tif args.Verbose {\n\t\tlog.SetDefaultLevel(log.DebugLevel)\n\t}\n\tlogger, err := createLogger()\n\tif err != nil {\n\t\tfmt.Printf(\"Creating logger failed: %s\\n\", err)\n\t\tos.Exit(1)\n\t}\n\n\tif args.Serve != \"\" {\n\t\tif err := runServer(ctx, args, logger); err != nil {\n\t\t\tfmt.Printf(\"Server execution error: %s\\n\", err)\n\t\t\tos.Exit(1)\n\t\t}\n\t\treturn\n\t}\n\n\tif err := runScraper(ctx, args, logger); err != nil {\n\t\tfmt.Printf(\"Scraping execution error: %s\\n\", err)\n\t\tos.Exit(1)\n\t}\n}\n\nfunc readArguments() (arguments, error) {\n\tvar args arguments\n\tparser, err := arg.NewParser(arg.Config{}, &args)\n\tif err != nil {\n\t\treturn arguments{}, fmt.Errorf(\"creating argument parser: %w\", err)\n\t}\n\n\tif err = parser.Parse(os.Args[1:]); err != nil {\n\t\tswitch {\n\t\tcase errors.Is(err, arg.ErrHelp):\n\t\t\tparser.WriteHelp(os.Stdout)\n\t\t\tos.Exit(0)\n\t\tcase errors.Is(err, 
arg.ErrVersion):\n\t\t\tfmt.Println(args.Version())\n\t\t\tos.Exit(0)\n\t\t}\n\n\t\treturn arguments{}, fmt.Errorf(\"parsing arguments: %w\", err)\n\t}\n\n\tif len(args.URLs) == 0 && args.Serve == \"\" {\n\t\tparser.WriteHelp(os.Stdout)\n\t\tos.Exit(0)\n\t}\n\n\treturn args, nil\n}\n\nfunc runScraper(ctx context.Context, args arguments, logger *log.Logger) error {\n\tif len(args.URLs) == 0 {\n\t\treturn nil\n\t}\n\n\tvar username, password string\n\tif args.User != \"\" {\n\t\t// split only on the first colon so that passwords may contain colons\n\t\tsl := strings.SplitN(args.User, \":\", 2)\n\t\tusername = sl[0]\n\t\tif len(sl) > 1 {\n\t\t\tpassword = sl[1]\n\t\t}\n\t}\n\n\timageQuality := args.ImageQuality\n\tif args.ImageQuality < 0 || args.ImageQuality >= 100 {\n\t\timageQuality = 0\n\t}\n\n\tcookies, err := readCookieFile(args.CookieFile)\n\tif err != nil {\n\t\treturn fmt.Errorf(\"reading cookie: %w\", err)\n\t}\n\n\tcfg := scraper.Config{\n\t\tIncludes: args.Include,\n\t\tExcludes: args.Exclude,\n\n\t\tImageQuality: uint(imageQuality),\n\t\tMaxDepth:     uint(args.Depth),\n\t\tTimeout:      uint(args.Timeout),\n\n\t\tOutputDirectory: args.Output,\n\t\tUsername:        username,\n\t\tPassword:        password,\n\n\t\tCookies:   cookies,\n\t\tHeader:    scraper.Headers(args.Headers),\n\t\tProxy:     args.Proxy,\n\t\tUserAgent: args.UserAgent,\n\t}\n\n\treturn scrapeURLs(ctx, cfg, logger, args)\n}\n\nfunc scrapeURLs(ctx context.Context, cfg scraper.Config,\n\tlogger *log.Logger, args arguments) error {\n\n\tfor _, url := range args.URLs {\n\t\tcfg.URL = url\n\t\tsc, err := scraper.New(logger, cfg)\n\t\tif err != nil {\n\t\t\treturn fmt.Errorf(\"initializing scraper: %w\", err)\n\t\t}\n\n\t\tlogger.Info(\"Scraping\", log.String(\"url\", sc.URL.String()))\n\t\tif err = sc.Start(ctx); err != nil {\n\t\t\tif errors.Is(err, context.Canceled) {\n\t\t\t\tos.Exit(0)\n\t\t\t}\n\n\t\t\treturn fmt.Errorf(\"scraping '%s': %w\", sc.URL, err)\n\t\t}\n\n\t\tif args.SaveCookieFile != \"\" {\n\t\t\tif err := saveCookies(args.SaveCookieFile, sc.Cookies()); err != nil {\n\t\t\t\treturn fmt.Errorf(\"saving cookies: %w\", err)\n\t\t\t}\n\t\t}\n\t}\n\n\treturn nil\n}\n\nfunc runServer(ctx context.Context, args arguments, logger *log.Logger) error {\n\tif err := scraper.ServeDirectory(ctx, args.Serve, args.ServerPort, logger); err != nil {\n\t\treturn fmt.Errorf(\"serving directory: %w\", err)\n\t}\n\treturn nil\n}\n\nfunc createLogger() (*log.Logger, error) {\n\tlogCfg, err := log.ConfigForEnv(env.Development)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"initializing log config: %w\", err)\n\t}\n\tlogCfg.JSONOutput = false\n\tlogCfg.CallerInfo = false\n\n\tlogger, err := log.NewWithConfig(logCfg)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"initializing logger: %w\", err)\n\t}\n\treturn logger, nil\n}\n\nfunc readCookieFile(cookieFile string) ([]scraper.Cookie, error) {\n\tif cookieFile == \"\" {\n\t\treturn nil, nil\n\t}\n\tb, err := os.ReadFile(cookieFile)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"reading cookie file: %w\", err)\n\t}\n\n\tvar cookies []scraper.Cookie\n\tif err := json.Unmarshal(b, &cookies); err != nil {\n\t\treturn nil, fmt.Errorf(\"unmarshaling cookies: %w\", err)\n\t}\n\n\treturn cookies, nil\n}\n\nfunc saveCookies(cookieFile string, cookies []scraper.Cookie) error {\n\tif cookieFile == \"\" || len(cookies) == 0 {\n\t\treturn nil\n\t}\n\n\tb, err := json.Marshal(cookies)\n\tif err != nil {\n\t\treturn fmt.Errorf(\"marshaling cookies: %w\", err)\n\t}\n\n\tif err := os.WriteFile(cookieFile, b, 0644); err != nil {\n\t\treturn fmt.Errorf(\"saving cookies: %w\", err)\n\t}\n\n\treturn nil\n}\n"
  },
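  {
    "path": "main_test.go",
    "content": "package main\n\nimport (\n\t\"path/filepath\"\n\t\"testing\"\n\n\t\"github.com/cornelk/goscrape/scraper\"\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n)\n\n// TestCookieFileRoundTrip is a minimal sketch that round-trips cookies through\n// saveCookies and readCookieFile, illustrating the JSON cookie file format\n// documented in the README.\nfunc TestCookieFileRoundTrip(t *testing.T) {\n\tfile := filepath.Join(t.TempDir(), \"cookies.json\")\n\tin := []scraper.Cookie{{Name: \"user\", Value: \"123\"}}\n\n\trequire.NoError(t, saveCookies(file, in))\n\n\tout, err := readCookieFile(file)\n\trequire.NoError(t, err)\n\tassert.Equal(t, in, out)\n}\n"
  },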
  {
    "path": "scraper/checks.go",
    "content": "// Package scraper provides a web scraper that can download a website and its assets.\npackage scraper\n\nimport (\n\t\"net/url\"\n\t\"strings\"\n\n\t\"github.com/cornelk/gotokit/log\"\n)\n\n// normalizeURLPath removes trailing slashes from URL paths for duplicate detection.\n// This treats URLs with and without trailing slashes as the same resource.\nfunc normalizeURLPath(path string) string {\n\tif path == \"\" {\n\t\treturn \"/\"\n\t}\n\t// Keep root path as is, but remove trailing slashes from other paths\n\tif path != \"/\" && strings.HasSuffix(path, \"/\") {\n\t\treturn strings.TrimSuffix(path, \"/\")\n\t}\n\treturn path\n}\n\n// shouldURLBeDownloaded checks whether a page should be downloaded.\n// nolint: cyclop\nfunc (s *Scraper) shouldURLBeDownloaded(url *url.URL, currentDepth uint, isAsset bool) bool {\n\tif url.Scheme != \"http\" && url.Scheme != \"https\" {\n\t\treturn false\n\t}\n\n\tp := url.String()\n\tif url.Host == s.URL.Host {\n\t\tp = url.Path\n\t}\n\tif p == \"\" {\n\t\tp = \"/\"\n\t}\n\n\t// Normalize the path for duplicate detection to handle trailing slashes\n\tnormalizedPath := normalizeURLPath(p)\n\n\tif s.processed.Contains(normalizedPath) { // was already downloaded or checked?\n\t\tif url.Fragment != \"\" {\n\t\t\treturn false\n\t\t}\n\t\treturn false\n\t}\n\n\ts.processed.Add(normalizedPath)\n\n\tif !isAsset {\n\t\tif url.Host != s.URL.Host {\n\t\t\ts.logger.Debug(\"Skipping external host page\", log.String(\"url\", url.String()))\n\t\t\treturn false\n\t\t}\n\n\t\tif s.config.MaxDepth != 0 && currentDepth == s.config.MaxDepth {\n\t\t\ts.logger.Debug(\"Skipping too deep level page\", log.String(\"url\", url.String()))\n\t\t\treturn false\n\t\t}\n\t}\n\n\tif s.includes != nil && !s.isURLIncluded(url) {\n\t\treturn false\n\t}\n\tif s.excludes != nil && s.isURLExcluded(url) {\n\t\treturn false\n\t}\n\n\ts.logger.Debug(\"New URL to download\", log.String(\"url\", url.String()))\n\treturn true\n}\n\nfunc (s *Scraper) isURLIncluded(url *url.URL) bool {\n\tfor _, re := range s.includes {\n\t\tif re.MatchString(url.Path) {\n\t\t\ts.logger.Info(\"Including URL\",\n\t\t\t\tlog.String(\"url\", url.String()),\n\t\t\t\tlog.Stringer(\"included_expression\", re))\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\nfunc (s *Scraper) isURLExcluded(url *url.URL) bool {\n\tfor _, re := range s.excludes {\n\t\tif re.MatchString(url.Path) {\n\t\t\ts.logger.Info(\"Skipping URL\",\n\t\t\t\tlog.String(\"url\", url.String()),\n\t\t\t\tlog.Stringer(\"excluded_expression\", re))\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n"
  },
  {
    "path": "scraper/checks_test.go",
    "content": "package scraper\n\nimport (\n\t\"net/url\"\n\t\"testing\"\n\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/cornelk/gotokit/set\"\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n)\n\nfunc TestNormalizeURLPath(t *testing.T) {\n\ttests := []struct {\n\t\tinput    string\n\t\texpected string\n\t}{\n\t\t{\"\", \"/\"},\n\t\t{\"/\", \"/\"},\n\t\t{\"/path\", \"/path\"},\n\t\t{\"/path/\", \"/path\"},\n\t\t{\"/path/to/resource\", \"/path/to/resource\"},\n\t\t{\"/path/to/resource/\", \"/path/to/resource\"},\n\t\t{\"/category/blog-post\", \"/category/blog-post\"},\n\t\t{\"/category/blog-post/\", \"/category/blog-post\"},\n\t}\n\n\tfor _, test := range tests {\n\t\tt.Run(test.input, func(t *testing.T) {\n\t\t\tresult := normalizeURLPath(test.input)\n\t\t\tassert.Equal(t, test.expected, result)\n\t\t})\n\t}\n}\n\nfunc TestShouldURLBeDownloaded_TrailingSlashDuplicates(t *testing.T) {\n\tlogger := log.NewTestLogger(t)\n\tcfg := Config{\n\t\tURL: \"https://example.com\",\n\t}\n\tscraper, err := New(logger, cfg)\n\trequire.NoError(t, err)\n\trequire.NotNil(t, scraper)\n\n\t// Initialize empty processed set\n\tscraper.processed = set.New[string]()\n\n\t// Test that URLs with and without trailing slashes are treated as duplicates\n\turl1, err := url.Parse(\"https://example.com/category/blog-post\")\n\trequire.NoError(t, err)\n\n\turl2, err := url.Parse(\"https://example.com/category/blog-post/\")\n\trequire.NoError(t, err)\n\n\t// First URL should be downloadable\n\tshould1 := scraper.shouldURLBeDownloaded(url1, 0, false)\n\tassert.True(t, should1, \"First URL should be downloadable\")\n\n\t// Second URL with trailing slash should be treated as duplicate\n\tshould2 := scraper.shouldURLBeDownloaded(url2, 0, false)\n\tassert.False(t, should2, \"Second URL with trailing slash should be treated as duplicate\")\n\n\t// Verify that the normalized path is in the processed set\n\tassert.True(t, scraper.processed.Contains(\"/category/blog-post\"))\n}\n\nfunc TestShouldURLBeDownloaded_TrailingSlashDuplicatesReverse(t *testing.T) {\n\tlogger := log.NewTestLogger(t)\n\tcfg := Config{\n\t\tURL: \"https://example.com\",\n\t}\n\tscraper, err := New(logger, cfg)\n\trequire.NoError(t, err)\n\trequire.NotNil(t, scraper)\n\n\t// Initialize empty processed set\n\tscraper.processed = set.New[string]()\n\n\t// Test reverse order - trailing slash first, then without\n\turl1, err := url.Parse(\"https://example.com/category/blog-post/\")\n\trequire.NoError(t, err)\n\n\turl2, err := url.Parse(\"https://example.com/category/blog-post\")\n\trequire.NoError(t, err)\n\n\t// First URL with trailing slash should be downloadable\n\tshould1 := scraper.shouldURLBeDownloaded(url1, 0, false)\n\tassert.True(t, should1, \"First URL with trailing slash should be downloadable\")\n\n\t// Second URL without trailing slash should be treated as duplicate\n\tshould2 := scraper.shouldURLBeDownloaded(url2, 0, false)\n\tassert.False(t, should2, \"Second URL without trailing slash should be treated as duplicate\")\n\n\t// Verify that the normalized path is in the processed set\n\tassert.True(t, scraper.processed.Contains(\"/category/blog-post\"))\n}\n\nfunc TestShouldURLBeDownloaded_RootPath(t *testing.T) {\n\tlogger := log.NewTestLogger(t)\n\tcfg := Config{\n\t\tURL: \"https://example.com\",\n\t}\n\tscraper, err := New(logger, cfg)\n\trequire.NoError(t, err)\n\trequire.NotNil(t, scraper)\n\n\t// Initialize empty processed set\n\tscraper.processed = set.New[string]()\n\n\t// Test root path 
normalization\n\turl1, err := url.Parse(\"https://example.com/\")\n\trequire.NoError(t, err)\n\n\turl2, err := url.Parse(\"https://example.com\")\n\trequire.NoError(t, err)\n\n\t// First root URL should be downloadable\n\tshould1 := scraper.shouldURLBeDownloaded(url1, 0, false)\n\tassert.True(t, should1, \"First root URL should be downloadable\")\n\n\t// Second root URL should be treated as duplicate\n\tshould2 := scraper.shouldURLBeDownloaded(url2, 0, false)\n\tassert.False(t, should2, \"Second root URL should be treated as duplicate\")\n\n\t// Verify that the normalized root path is in the processed set\n\tassert.True(t, scraper.processed.Contains(\"/\"))\n}\n\nfunc TestShouldURLBeDownloaded_ExternalURLs(t *testing.T) {\n\tlogger := log.NewTestLogger(t)\n\tcfg := Config{\n\t\tURL: \"https://example.com\",\n\t}\n\tscraper, err := New(logger, cfg)\n\trequire.NoError(t, err)\n\trequire.NotNil(t, scraper)\n\n\t// Initialize empty processed set\n\tscraper.processed = set.New[string]()\n\n\t// Test external URLs with trailing slashes as assets\n\turl1, err := url.Parse(\"https://external.com/path.css\")\n\trequire.NoError(t, err)\n\n\turl2, err := url.Parse(\"https://external.com/path.css/\")\n\trequire.NoError(t, err)\n\n\t// First external asset should be downloadable (if it passes other checks)\n\tshould1 := scraper.shouldURLBeDownloaded(url1, 0, true) // asset = true\n\n\t// Second external asset with trailing slash should be treated as duplicate\n\tshould2 := scraper.shouldURLBeDownloaded(url2, 0, true) // asset = true\n\n\t// First should pass, second should be blocked as duplicate\n\tassert.True(t, should1, \"First external asset should be downloadable\")\n\tassert.False(t, should2, \"Second external asset with trailing slash should be treated as duplicate\")\n\n\t// Verify that the normalized external URL is in the processed set\n\tnormalizedURL1 := normalizeURLPath(url1.String())\n\tassert.True(t, scraper.processed.Contains(normalizedURL1))\n}\n"
  },
  {
    "path": "scraper/cookies.go",
    "content": "package scraper\n\nimport (\n\t\"fmt\"\n\t\"net/http\"\n\t\"net/http/cookiejar\"\n\t\"net/url\"\n\t\"time\"\n)\n\n// Cookie represents a cookie, it copies parts of the http.Cookie struct but changes\n// the JSON marshaling to exclude empty fields.\ntype Cookie struct {\n\tName  string `json:\"name\"`\n\tValue string `json:\"value,omitempty\"`\n\n\tExpires *time.Time `json:\"expires,omitempty\"`\n}\n\n// Cookies returns the current cookies.\nfunc (s *Scraper) Cookies() []Cookie {\n\thttpCookies := s.cookies.Cookies(s.URL)\n\tcookies := make([]Cookie, 0, len(httpCookies))\n\n\tfor _, c := range httpCookies {\n\t\tcookie := Cookie{\n\t\t\tName:  c.Name,\n\t\t\tValue: c.Value,\n\t\t}\n\t\tif !c.Expires.IsZero() {\n\t\t\tcookie.Expires = &c.Expires\n\t\t}\n\t\tcookies = append(cookies, cookie)\n\t}\n\n\treturn cookies\n}\n\nfunc createCookieJar(u *url.URL, cookies []Cookie) (*cookiejar.Jar, error) {\n\tjar, err := cookiejar.New(nil)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"creating cookie jar: %w\", err)\n\t}\n\n\thttpCookies := make([]*http.Cookie, 0, len(cookies))\n\tfor _, c := range cookies {\n\t\th := &http.Cookie{\n\t\t\tName:  c.Name,\n\t\t\tValue: c.Value,\n\t\t}\n\t\tif c.Expires != nil {\n\t\t\th.Expires = *c.Expires\n\t\t}\n\t\thttpCookies = append(httpCookies, h)\n\t}\n\n\tjar.SetCookies(u, httpCookies)\n\treturn jar, nil\n}\n"
  },
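  {
    "path": "scraper/cookies_test.go",
    "content": "package scraper\n\nimport (\n\t\"encoding/json\"\n\t\"testing\"\n\t\"time\"\n\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n)\n\n// TestCookieMarshal is a minimal sketch of why the custom Cookie type exists:\n// unlike http.Cookie, empty fields are omitted from the JSON output, so a\n// cookie without an expiry marshals to just its name and value.\nfunc TestCookieMarshal(t *testing.T) {\n\texpires := time.Date(2025, 1, 1, 0, 0, 0, 0, time.UTC)\n\tcookies := []Cookie{\n\t\t{Name: \"user\", Value: \"123\"},\n\t\t{Name: \"session\", Value: \"sid\", Expires: &expires},\n\t}\n\n\tb, err := json.Marshal(cookies)\n\trequire.NoError(t, err)\n\tassert.JSONEq(t, `[{\"name\":\"user\",\"value\":\"123\"},{\"name\":\"session\",\"value\":\"sid\",\"expires\":\"2025-01-01T00:00:00Z\"}]`, string(b))\n}\n"
  },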
  {
    "path": "scraper/download.go",
    "content": "package scraper\n\nimport (\n\t\"context\"\n\t\"errors\"\n\t\"fmt\"\n\t\"net/url\"\n\t\"path\"\n\n\t\"github.com/cornelk/goscrape/css\"\n\t\"github.com/cornelk/goscrape/htmlindex\"\n\t\"github.com/cornelk/gotokit/log\"\n)\n\n// assetProcessor is a processor of a downloaded asset that can transform\n// a downloaded file content before it will be stored on disk.\ntype assetProcessor func(URL *url.URL, data []byte) []byte\n\nvar tagsWithReferences = []string{\n\thtmlindex.LinkTag,\n\thtmlindex.ScriptTag,\n\thtmlindex.BodyTag,\n\thtmlindex.StyleTag,\n}\n\nfunc (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {\n\treferences, err := index.URLs(htmlindex.BodyTag)\n\tif err != nil {\n\t\ts.logger.Error(\"Getting body node URLs failed\", log.Err(err))\n\t}\n\ts.imagesQueue = append(s.imagesQueue, references...)\n\n\treferences, err = index.URLs(htmlindex.ImgTag)\n\tif err != nil {\n\t\ts.logger.Error(\"Getting img node URLs failed\", log.Err(err))\n\t}\n\ts.imagesQueue = append(s.imagesQueue, references...)\n\n\tfor _, tag := range tagsWithReferences {\n\t\treferences, err = index.URLs(tag)\n\t\tif err != nil {\n\t\t\ts.logger.Error(\"Getting node URLs failed\",\n\t\t\t\tlog.String(\"node\", tag),\n\t\t\t\tlog.Err(err))\n\t\t}\n\n\t\tvar processor assetProcessor\n\t\tif tag == htmlindex.LinkTag {\n\t\t\tprocessor = s.cssProcessor\n\t\t}\n\t\tfor _, ur := range references {\n\t\t\tif err := s.downloadAsset(ctx, ur, processor); err != nil && errors.Is(err, context.Canceled) {\n\t\t\t\treturn err\n\t\t\t}\n\t\t}\n\t}\n\n\tfor _, image := range s.imagesQueue {\n\t\tif err := s.downloadAsset(ctx, image, s.checkImageForRecode); err != nil && errors.Is(err, context.Canceled) {\n\t\t\treturn err\n\t\t}\n\t}\n\ts.imagesQueue = nil\n\treturn nil\n}\n\n// downloadAsset downloads an asset if it does not exist on disk yet.\nfunc (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor assetProcessor) error {\n\tu.Fragment = \"\"\n\turlFull := u.String()\n\n\tif !s.shouldURLBeDownloaded(u, 0, true) {\n\t\treturn nil\n\t}\n\n\tfilePath := s.getFilePath(u, false)\n\tif s.fileExists(filePath) {\n\t\treturn nil\n\t}\n\n\ts.logger.Info(\"Downloading asset\", log.String(\"url\", urlFull))\n\tdata, _, err := s.httpDownloader(ctx, u)\n\tif err != nil {\n\t\ts.logger.Error(\"Downloading asset failed\",\n\t\t\tlog.String(\"url\", urlFull),\n\t\t\tlog.Err(err))\n\t\treturn fmt.Errorf(\"downloading asset: %w\", err)\n\t}\n\n\tif processor != nil {\n\t\tdata = processor(u, data)\n\t}\n\n\tif err = s.fileWriter(filePath, data); err != nil {\n\t\ts.logger.Error(\"Writing asset file failed\",\n\t\t\tlog.String(\"url\", urlFull),\n\t\t\tlog.String(\"file\", filePath),\n\t\t\tlog.Err(err))\n\t}\n\n\treturn nil\n}\n\nfunc (s *Scraper) cssProcessor(baseURL *url.URL, data []byte) []byte {\n\turls := make(map[string]string)\n\n\tprocessor := func(token *css.Token, data string, u *url.URL) {\n\t\ts.imagesQueue = append(s.imagesQueue, u)\n\n\t\tcssPath := *u\n\t\tcssPath.Path = path.Dir(cssPath.Path) + \"/\"\n\t\tresolved := resolveURL(&cssPath, data, s.URL.Host, false, \"\")\n\t\turls[token.Value] = resolved\n\t}\n\n\tcssData := string(data)\n\tcss.Process(s.logger, baseURL, cssData, processor)\n\n\tif len(urls) == 0 {\n\t\treturn data\n\t}\n\n\tfor ori, filePath := range urls {\n\t\tcssData = replaceCSSUrls(ori, filePath, cssData)\n\t\ts.logger.Debug(\"CSS Element relinked\",\n\t\t\tlog.String(\"url\", ori),\n\t\t\tlog.String(\"fixed_url\", filePath))\n\t}\n\n\treturn 
[]byte(cssData)\n}\n"
  },
  {
    "path": "scraper/download_test.go",
    "content": "package scraper\n\nimport (\n\t\"net/url\"\n\t\"testing\"\n\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n)\n\nfunc TestCSSProcessor(t *testing.T) {\n\tlogger := log.NewTestLogger(t)\n\tcfg := Config{\n\t\tURL: \"http://localhost\",\n\t}\n\ts, err := New(logger, cfg)\n\trequire.NoError(t, err)\n\n\tvar fixtures = map[string]string{\n\t\t\"url('http://localhost/uri/between/single/quote')\": \"http://localhost/uri/between/single/quote\",\n\t\t`url(\"http://localhost/uri/between/double/quote\")`: \"http://localhost/uri/between/double/quote\",\n\t\t\"url(http://localhost/uri)\":                        \"http://localhost/uri\",\n\t\t\"url(data:image/gif;base64,R0lGODl)\":               \"\",\n\t\t`div#gopher {\n\t\t\tbackground: url(/doc/gopher/frontpage.png) no-repeat;\n\t\t\theight: 155px;\n\t\t\t}`: \"http://localhost/doc/gopher/frontpage.png\",\n\t}\n\n\tu, _ := url.Parse(\"http://localhost\")\n\tfor input, expected := range fixtures {\n\t\ts.imagesQueue = nil\n\t\ts.cssProcessor(u, []byte(input))\n\n\t\tif expected == \"\" {\n\t\t\tassert.Empty(t, s.imagesQueue)\n\t\t\tcontinue\n\t\t}\n\n\t\tassert.NotEmpty(t, s.imagesQueue)\n\n\t\tres := s.imagesQueue[0].String()\n\t\tassert.Equal(t, expected, res)\n\t}\n}\n"
  },
  {
    "path": "scraper/fileutil.go",
    "content": "package scraper\n\nimport (\n\t\"fmt\"\n\t\"hash/fnv\"\n\t\"net/url\"\n\t\"path/filepath\"\n\t\"strings\"\n)\n\nconst (\n\t// PageExtension is the file extension that downloaded pages get.\n\tPageExtension = \".html\"\n\t// PageDirIndex is the file name of the index file for every dir.\n\tPageDirIndex = \"index\" + PageExtension\n\t// MaxFilenameLength is the maximum length for a filename component to ensure filesystem compatibility.\n\tMaxFilenameLength = 200\n)\n\n// getFilePath returns a file path for a URL to store the URL content in.\n// The isAPage parameter is crucial: it tells us whether this URL contains HTML content\n// that should be treated as a web page (with .html extensions and directory indexing)\n// or if it's a binary file that should keep its original path unchanged.\n// Without this distinction, binary files would get corrupted paths like image.jpg.html.\nfunc (s *Scraper) getFilePath(url *url.URL, isAPage bool) string {\n\tfileName := url.Path\n\tif isAPage {\n\t\t// This is HTML content - apply web page naming conventions\n\t\tfileName = getPageFilePath(url)\n\t}\n\t// If not a page, keep the original URL path for binary files\n\n\tvar externalHost string\n\tif url.Host != s.URL.Host {\n\t\texternalHost = \"_\" + url.Host // _ is a prefix for external domains on the filesystem\n\t}\n\n\t// Split the file path into directory and filename components\n\tdir := filepath.Dir(fileName)\n\tbase := filepath.Base(fileName)\n\n\t// Truncate the filename component if it's too long\n\ttruncatedBase := truncateFilename(base)\n\n\t// Reconstruct the path with the truncated filename\n\tif dir == \".\" {\n\t\tfileName = truncatedBase\n\t} else {\n\t\tfileName = filepath.Join(dir, truncatedBase)\n\t}\n\n\treturn filepath.Join(s.config.OutputDirectory, s.URL.Host, externalHost, fileName)\n}\n\n// getPageFilePath returns a filename for a URL that represents a web page.\n// This function adds .html extensions and handles directory indexing,\n// which is what we want for HTML content but NOT for binary files like images or PDFs.\nfunc getPageFilePath(url *url.URL) string {\n\tfileName := url.Path\n\n\t// root of domain will be index.html\n\tswitch {\n\tcase fileName == \"\" || fileName == \"/\":\n\t\tfileName = PageDirIndex\n\t\t// directory index will be index.html in the directory\n\n\tcase fileName[len(fileName)-1] == '/':\n\t\tfileName += PageDirIndex\n\n\tdefault:\n\t\text := filepath.Ext(fileName)\n\t\t// if file extension is missing add .html, otherwise keep the existing file extension\n\t\tif ext == \"\" {\n\t\t\tfileName += PageExtension\n\t\t}\n\t}\n\n\treturn fileName\n}\n\n// truncateFilename truncates a filename if it exceeds MaxFilenameLength while preserving the extension.\nfunc truncateFilename(filename string) string {\n\tif len(filename) <= MaxFilenameLength {\n\t\treturn filename\n\t}\n\n\text := filepath.Ext(filename)\n\tbaseName := strings.TrimSuffix(filename, ext)\n\n\t// Calculate how much space we need for hash and extension\n\thashLength := 8 // Using first 8 hex characters (from 32-bit FNV)\n\treservedLength := hashLength + len(ext)\n\n\t// If the extension alone is too long, truncate it too\n\tif reservedLength > MaxFilenameLength {\n\t\text = ext[:MaxFilenameLength-hashLength]\n\t\treservedLength = hashLength + len(ext)\n\t}\n\n\tmaxBaseLength := MaxFilenameLength - reservedLength\n\tif maxBaseLength <= 0 {\n\t\tmaxBaseLength = 1\n\t}\n\n\ttruncatedBase := baseName[:maxBaseLength]\n\n\t// Generate FNV-1a hash of original filename to ensure 
uniqueness\n\th := fnv.New32a()\n\t_, _ = h.Write([]byte(filename))\n\thashStr := fmt.Sprintf(\"%08x\", h.Sum32())[:hashLength]\n\n\treturn truncatedBase + hashStr + ext\n}\n"
  },
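  {
    "path": "scraper/fileutil_example_test.go",
    "content": "package scraper\n\n// Hypothetical example file, not part of the original repository: a minimal\n// sketch illustrating how getPageFilePath maps URL paths to local file names.\n\nimport (\n\t\"fmt\"\n\t\"net/url\"\n)\n\nfunc Example_getPageFilePath() {\n\tfor _, raw := range []string{\n\t\t\"https://example.org/\",\n\t\t\"https://example.org/docs/\",\n\t\t\"https://example.org/about\",\n\t\t\"https://example.org/report.aspx\",\n\t} {\n\t\tu, _ := url.Parse(raw)\n\t\tfmt.Println(getPageFilePath(u))\n\t}\n\t// Output:\n\t// index.html\n\t// /docs/index.html\n\t// /about.html\n\t// /report.aspx\n}\n"
  },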
  {
    "path": "scraper/fileutil_test.go",
    "content": "package scraper\n\nimport (\n\t\"net/url\"\n\t\"os\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n)\n\nfunc TestGetFilePath(t *testing.T) {\n\ttype filePathFixture struct {\n\t\tBaseURL          string\n\t\tDownloadURL      string\n\t\tExpectedFilePath string\n\t}\n\n\tpathSeparator := string(os.PathSeparator)\n\texpectedBasePath := \"google.com\" + pathSeparator\n\tvar fixtures = []filePathFixture{\n\t\t{\"https://google.com/\", \"https://github.com/\", expectedBasePath + \"_github.com\" + pathSeparator + \"index.html\"},\n\t\t{\"https://google.com/\", \"https://github.com/#fragment\", expectedBasePath + \"_github.com\" + pathSeparator + \"index.html\"},\n\t\t{\"https://google.com/\", \"https://github.com/test\", expectedBasePath + \"_github.com\" + pathSeparator + \"test.html\"},\n\t\t{\"https://google.com/\", \"https://github.com/test/\", expectedBasePath + \"_github.com\" + pathSeparator + \"test\" + pathSeparator + \"index.html\"},\n\t\t{\"https://google.com/\", \"https://github.com/test.aspx\", expectedBasePath + \"_github.com\" + pathSeparator + \"test.aspx\"},\n\t\t{\"https://google.com/\", \"https://google.com/settings\", expectedBasePath + \"settings.html\"},\n\t}\n\n\tvar cfg Config\n\tlogger := log.NewTestLogger(t)\n\tfor _, fix := range fixtures {\n\t\tcfg.URL = fix.BaseURL\n\t\ts, err := New(logger, cfg)\n\t\trequire.NoError(t, err)\n\n\t\tURL, err := url.Parse(fix.DownloadURL)\n\t\trequire.NoError(t, err)\n\n\t\toutput := s.getFilePath(URL, true)\n\t\tassert.Equal(t, fix.ExpectedFilePath, output)\n\t}\n}\n\nfunc TestTruncateFilename(t *testing.T) {\n\ttests := []struct {\n\t\tname     string\n\t\tfilename string\n\t\texpected func(string) bool // function to validate the result\n\t}{\n\t\t{\n\t\t\tname:     \"short filename unchanged\",\n\t\t\tfilename: \"test.css\",\n\t\t\texpected: func(result string) bool { return result == \"test.css\" },\n\t\t},\n\t\t{\n\t\t\tname:     \"long filename gets truncated\",\n\t\t\tfilename: \"very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename.css\",\n\t\t\texpected: func(result string) bool {\n\t\t\t\treturn len(result) <= MaxFilenameLength &&\n\t\t\t\t\tlen(result) > 0 &&\n\t\t\t\t\tresult != \"very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename.css\" &&\n\t\t\t\t\tresult[len(result)-4:] == \".css\"\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tname:     \"filename without extension\",\n\t\t\tfilename: \"very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename\",\n\t\t\texpected: func(result string) bool {\n\t\t\t\treturn len(result) <= MaxFilenameLength &&\n\t\t\t\t\tlen(result) > 0 &&\n\t\t\t\t\tresult != \"very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename\"\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tname:     \"empty filename\",\n\t\t\tfilename: \"\",\n\t\t\texpected: func(result string) bool { return 
result == \"\" },\n\t\t},\n\t\t{\n\t\t\tname:     \"filename at max length\",\n\t\t\tfilename: strings.Repeat(\"a\", MaxFilenameLength),\n\t\t\texpected: func(result string) bool { return len(result) == MaxFilenameLength },\n\t\t},\n\t}\n\n\tfor _, tt := range tests {\n\t\tt.Run(tt.name, func(t *testing.T) {\n\t\t\tresult := truncateFilename(tt.filename)\n\t\t\tassert.True(t, tt.expected(result), \"filename: %q, result: %q\", tt.filename, result)\n\t\t})\n\t}\n}\n\nfunc TestTruncateFilenameUniqueness(t *testing.T) {\n\t// Test that different long filenames with the same prefix produce different results\n\tlongPrefix := \"this-is-a-very-long-filename-prefix-that-will-be-truncated-and-should-produce-different-results-based-on-the-hash-suffix-when-the-full-filename-is-different\"\n\n\tfilename1 := longPrefix + \"-file1.css\"\n\tfilename2 := longPrefix + \"-file2.css\"\n\n\tresult1 := truncateFilename(filename1)\n\tresult2 := truncateFilename(filename2)\n\n\tassert.NotEqual(t, result1, result2, \"Different long filenames should produce different truncated results\")\n\tassert.LessOrEqual(t, len(result1), MaxFilenameLength, \"Result1 should be within max length\")\n\tassert.LessOrEqual(t, len(result2), MaxFilenameLength, \"Result2 should be within max length\")\n}\n"
  },
  {
    "path": "scraper/fs.go",
    "content": "package scraper\n\nimport (\n\t\"fmt\"\n\t\"os\"\n\t\"path/filepath\"\n\n\t\"github.com/cornelk/gotokit/log\"\n)\n\n// createDownloadPath creates the download path if it does not exist yet.\nfunc (s *Scraper) createDownloadPath(path string) error {\n\tif path == \"\" {\n\t\treturn nil\n\t}\n\n\ts.logger.Debug(\"Creating dir\", log.String(\"path\", path))\n\tif err := os.MkdirAll(path, os.ModePerm); err != nil {\n\t\treturn fmt.Errorf(\"creating directory '%s': %w\", path, err)\n\t}\n\treturn nil\n}\n\nfunc (s *Scraper) writeFile(filePath string, data []byte) error {\n\tdir := filepath.Dir(filePath)\n\tif len(dir) < len(s.URL.Host) { // nothing to append if it is the root dir\n\t\tdir = filepath.Join(\".\", s.URL.Host, dir)\n\t}\n\n\tif err := s.dirCreator(dir); err != nil {\n\t\treturn err\n\t}\n\n\ts.logger.Debug(\"Creating file\", log.String(\"path\", filePath))\n\tf, err := os.Create(filePath)\n\tif err != nil {\n\t\treturn fmt.Errorf(\"creating file '%s': %w\", filePath, err)\n\t}\n\n\tif _, err = f.Write(data); err != nil {\n\t\t// nolint: wrapcheck\n\t\t_ = f.Close() // try to close and remove file but return the first error\n\t\t_ = os.Remove(filePath)\n\t\treturn fmt.Errorf(\"writing to file: %w\", err)\n\t}\n\n\tif err := f.Close(); err != nil {\n\t\treturn fmt.Errorf(\"closing file: %w\", err)\n\t}\n\treturn nil\n}\n\nfunc (s *Scraper) fileExists(filePath string) bool {\n\tif _, err := os.Stat(filePath); !os.IsNotExist(err) {\n\t\treturn true\n\t}\n\treturn false\n}\n"
  },
  {
    "path": "scraper/html.go",
    "content": "package scraper\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"net/url\"\n\t\"slices\"\n\t\"strings\"\n\n\t\"github.com/cornelk/goscrape/css\"\n\t\"github.com/cornelk/goscrape/htmlindex\"\n\t\"github.com/cornelk/gotokit/log\"\n\t\"golang.org/x/net/html\"\n)\n\n// ignoredURLPrefixes contains a list of URL prefixes that do not need to bo adjusted.\nvar ignoredURLPrefixes = []string{\n\t\"#\",       // fragment\n\t\"/#\",      // fragment\n\t\"data:\",   // embedded data\n\t\"mailto:\", // mail address\n}\n\n// fixURLReferences fixes URL references to point to relative file names.\n// It returns a bool that indicates that no reference needed to be fixed,\n// in this case the returned HTML string will be empty.\nfunc (s *Scraper) fixURLReferences(url *url.URL, doc *html.Node,\n\tindex *htmlindex.Index) ([]byte, bool, error) {\n\n\trelativeToRoot := urlRelativeToRoot(url)\n\tif !s.fixHTMLNodeURLs(url, relativeToRoot, index) {\n\t\treturn nil, false, nil\n\t}\n\n\tvar rendered bytes.Buffer\n\tif err := html.Render(&rendered, doc); err != nil {\n\t\treturn nil, false, fmt.Errorf(\"rendering html: %w\", err)\n\t}\n\treturn rendered.Bytes(), true, nil\n}\n\n// fixHTMLNodeURLs processes all HTML nodes that contain URLs that need to be fixed\n// to link to downloaded files. It returns whether any URLS have been fixed.\nfunc (s *Scraper) fixHTMLNodeURLs(baseURL *url.URL, relativeToRoot string, index *htmlindex.Index) bool {\n\tvar changed bool\n\n\tfor tag, nodeInfo := range htmlindex.Nodes {\n\t\tisHyperlink := tag == htmlindex.ATag\n\n\t\turls := index.Nodes(tag)\n\t\tfor _, nodes := range urls {\n\t\t\tfor _, node := range nodes {\n\t\t\t\tswitch node.Data {\n\t\t\t\tcase htmlindex.StyleTag:\n\t\t\t\t\tif s.fixScriptNodeURL(baseURL, node, isHyperlink, relativeToRoot) {\n\t\t\t\t\t\tchanged = true\n\t\t\t\t\t}\n\t\t\t\tdefault:\n\t\t\t\t\tif s.fixNodeURL(baseURL, nodeInfo.Attributes, node, isHyperlink, relativeToRoot) {\n\t\t\t\t\t\tchanged = true\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\treturn changed\n}\n\n// fixNodeURL fixes the URL references of a HTML node to point to a relative file name.\n// It returns whether any attribute value bas been adjusted.\nfunc (s *Scraper) fixNodeURL(baseURL *url.URL, attributes []string, node *html.Node,\n\tisHyperlink bool, relativeToRoot string) bool {\n\n\tvar changed bool\n\n\tfor i, attr := range node.Attr {\n\t\tif !slices.Contains(attributes, attr.Key) {\n\t\t\tcontinue\n\t\t}\n\n\t\tvalue := strings.TrimSpace(attr.Val)\n\t\tif value == \"\" {\n\t\t\tcontinue\n\t\t}\n\n\t\tfor _, prefix := range ignoredURLPrefixes {\n\t\t\tif strings.HasPrefix(value, prefix) {\n\t\t\t\treturn false\n\t\t\t}\n\t\t}\n\n\t\tvar adjusted string\n\n\t\tif htmlindex.SrcSetAttributes.Contains(attr.Key) {\n\t\t\tadjusted = resolveSrcSetURLs(baseURL, value, s.URL.Host, isHyperlink, relativeToRoot)\n\t\t} else {\n\t\t\tadjusted = resolveURL(baseURL, value, s.URL.Host, isHyperlink, relativeToRoot)\n\t\t}\n\n\t\tif adjusted == value { // check for no change\n\t\t\tcontinue\n\t\t}\n\n\t\ts.logger.Debug(\"HTML node relinked\",\n\t\t\tlog.String(\"value\", value),\n\t\t\tlog.String(\"fixed_value\", adjusted))\n\n\t\tattribute := &node.Attr[i]\n\t\tattribute.Val = adjusted\n\t\tchanged = true\n\t}\n\n\treturn changed\n}\n\n// fixScriptNodeURL fixes the URL references of a HTML script node to point to a relative file name.\n// It returns whether any attribute value bas been adjusted.\nfunc (s *Scraper) fixScriptNodeURL(baseURL *url.URL, node *html.Node,\n\tisHyperlink 
bool, relativeToRoot string) bool {\n\n\tif node.FirstChild == nil {\n\t\treturn false\n\t}\n\n\turls := map[string]string{}\n\n\tprocessor := func(_ *css.Token, before string, _ *url.URL) {\n\t\tadjusted := resolveURL(baseURL, before, s.URL.Host, isHyperlink, relativeToRoot)\n\t\tif before != adjusted {\n\t\t\turls[before] = adjusted\n\t\t}\n\t}\n\n\tcssData := node.FirstChild.Data\n\tcss.Process(s.logger, baseURL, cssData, processor)\n\n\tvar changed bool\n\n\tfor before, filePath := range urls {\n\t\tcssData = replaceCSSUrls(before, filePath, cssData)\n\t\ts.logger.Debug(\"CSS Element relinked\",\n\t\t\tlog.String(\"url\", before),\n\t\t\tlog.String(\"fixed_url\", filePath))\n\t\tchanged = true\n\t}\n\n\tnode.FirstChild.Data = cssData\n\n\treturn changed\n}\n\nfunc resolveSrcSetURLs(base *url.URL, srcSetValue, mainPageHost string, isHyperlink bool, relativeToRoot string) string {\n\t// split the set of responsive images\n\tvalues := strings.Split(srcSetValue, \",\")\n\n\tfor i, value := range values {\n\t\tvalue = strings.TrimSpace(value)\n\t\tparts := strings.Split(value, \" \")\n\t\tparts[0] = resolveURL(base, parts[0], mainPageHost, isHyperlink, relativeToRoot)\n\t\tvalues[i] = strings.Join(parts, \" \")\n\t}\n\n\treturn strings.Join(values, \", \")\n}\n\nfunc replaceCSSUrls(before, after, content string) string {\n\tprefixes := []string{\n\t\t\"\\\"\", \"'\", \"\",\n\t}\n\n\tfor _, prefix := range prefixes {\n\t\twrong := fmt.Sprintf(\"url(%s%s%s)\", prefix, before, prefix)\n\t\tfixed := \"url('\" + after + \"')\"\n\t\tcontent = strings.ReplaceAll(content, wrong, fixed)\n\t}\n\n\treturn content\n}\n"
  },
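  {
    "path": "scraper/html_example_test.go",
    "content": "package scraper\n\n// Hypothetical example file, not part of the original repository: a minimal\n// sketch illustrating how replaceCSSUrls rewrites url() references in CSS.\n\nimport \"fmt\"\n\nfunc Example_replaceCSSUrls() {\n\tcss := \"h1 { background-image: url('/img/bg.jpg'); }\"\n\tfmt.Println(replaceCSSUrls(\"/img/bg.jpg\", \"img/bg.jpg\", css))\n\t// Output:\n\t// h1 { background-image: url('img/bg.jpg'); }\n}\n"
  },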
  {
    "path": "scraper/html_test.go",
    "content": "package scraper\n\nimport (\n\t\"bytes\"\n\t\"testing\"\n\n\t\"github.com/cornelk/goscrape/htmlindex\"\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n\t\"golang.org/x/net/html\"\n)\n\nfunc TestFixURLReferences(t *testing.T) {\n\tlogger := log.NewTestLogger(t)\n\tcfg := Config{\n\t\tURL: \"http://domain.com\",\n\t}\n\ts, err := New(logger, cfg)\n\trequire.NoError(t, err)\n\n\tb := []byte(`\n<html lang=\"es\">\n<a href=\"https://domain.com/wp-content/uploads/document.pdf\" rel=\"doc\">Guide</a>\n<img src=\"https://domain.com/test.jpg\" srcset=\"https://domain.com/test-480w.jpg 480w, https://domain.com/test-800w.jpg 800w\"/> \n</html>\n`)\n\n\tbuf := &bytes.Buffer{}\n\t_, err = buf.Write(b)\n\trequire.NoError(t, err)\n\n\tdoc, err := html.Parse(buf)\n\trequire.NoError(t, err)\n\n\tindex := htmlindex.New(logger)\n\tindex.Index(s.URL, doc)\n\n\tref, fixed, err := s.fixURLReferences(s.URL, doc, index)\n\trequire.NoError(t, err)\n\tassert.True(t, fixed)\n\n\texpected := \"<html lang=\\\"es\\\"><head></head><body>\" +\n\t\t\"<a href=\\\"wp-content/uploads/document.pdf\\\" rel=\\\"doc\\\">Guide</a>\\n\" +\n\t\t\"<img src=\\\"test.jpg\\\" srcset=\\\"test-480w.jpg 480w, test-800w.jpg 800w\\\"/> \\n\\n\" +\n\t\t\"</body></html>\"\n\tassert.Equal(t, expected, string(ref))\n}\n"
  },
  {
    "path": "scraper/http.go",
    "content": "package scraper\n\nimport (\n\t\"bytes\"\n\t\"context\"\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"net/http\"\n\t\"net/url\"\n\t\"strings\"\n\t\"time\"\n\n\t\"github.com/cornelk/gotokit/app\"\n\t\"github.com/cornelk/gotokit/log\"\n)\n\nvar (\n\tmaxRetries = 10\n\tretryDelay = 1500 * time.Millisecond\n\n\terrExhaustedRetries = errors.New(\"exhausted retries\")\n)\n\nfunc (s *Scraper) downloadURL(ctx context.Context, u *url.URL) (*http.Response, error) {\n\treq, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"creating HTTP request: %w\", err)\n\t}\n\n\treq.Header.Set(\"User-Agent\", s.config.UserAgent)\n\tif s.auth != \"\" {\n\t\treq.Header.Set(\"Authorization\", s.auth)\n\t}\n\n\tfor key, values := range s.config.Header {\n\t\tfor _, value := range values {\n\t\t\treq.Header.Set(key, value)\n\t\t}\n\t}\n\n\tresp, err := s.client.Do(req)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"executing HTTP request: %w\", err)\n\t}\n\n\treturn resp, nil\n}\n\nfunc (s *Scraper) downloadURLWithRetries(ctx context.Context, u *url.URL) ([]byte, *url.URL, error) {\n\tvar err error\n\tvar resp *http.Response\n\n\tfor retries := range maxRetries + 2 {\n\t\tif retries == maxRetries+1 {\n\t\t\treturn nil, nil, fmt.Errorf(\"%w for URL %s\", errExhaustedRetries, u)\n\t\t}\n\n\t\tresp, err = s.downloadURL(ctx, u)\n\t\tif err != nil {\n\t\t\treturn nil, nil, err\n\t\t}\n\n\t\tif resp.StatusCode == http.StatusTooManyRequests {\n\t\t\ts.logger.Warn(\"Too Many Requests. Retrying again\",\n\t\t\t\tlog.Int(\"num\", retries+1),\n\t\t\t\tlog.Int(\"max\", maxRetries),\n\t\t\t\tlog.String(\"url\", u.String()))\n\n\t\t\t// Wait a bit and try again using exponential backoff on each retry\n\t\t\tif err := app.Sleep(ctx, (time.Duration(retries)+1)*retryDelay); err != nil {\n\t\t\t\treturn nil, nil, fmt.Errorf(\"sleeping between retries: %w\", err)\n\t\t\t}\n\t\t\tcontinue\n\t\t}\n\t\tbreak\n\t}\n\n\tdefer func() {\n\t\tif err := resp.Body.Close(); err != nil {\n\t\t\ts.logger.Error(\"Closing HTTP Request body failed\",\n\t\t\t\tlog.String(\"url\", u.String()),\n\t\t\t\tlog.Err(err))\n\t\t}\n\t}()\n\n\tif resp.StatusCode != http.StatusOK {\n\t\treturn nil, nil, fmt.Errorf(\"unexpected HTTP request status code %d\", resp.StatusCode)\n\t}\n\n\tbuf := &bytes.Buffer{}\n\tif _, err := io.Copy(buf, resp.Body); err != nil {\n\t\treturn nil, nil, fmt.Errorf(\"reading HTTP request body: %w\", err)\n\t}\n\treturn buf.Bytes(), resp.Request.URL, nil\n}\n\n// Headers converts a slice of strings to a http.Header.\nfunc Headers(headers []string) http.Header {\n\th := http.Header{}\n\tfor _, header := range headers {\n\t\tsl := strings.SplitN(header, \":\", 2)\n\t\tif len(sl) == 2 {\n\t\t\th.Set(sl[0], sl[1])\n\t\t}\n\t}\n\treturn h\n}\n"
  },
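  {
    "path": "scraper/http_example_test.go",
    "content": "package scraper\n\n// Hypothetical example file, not part of the original repository: a minimal\n// sketch illustrating how Headers converts \"key:value\" strings to an http.Header.\n\nimport \"fmt\"\n\nfunc ExampleHeaders() {\n\th := Headers([]string{\"User-Agent:goscrape\", \"Accept:text/html\"})\n\tfmt.Println(h.Get(\"User-Agent\"))\n\tfmt.Println(h.Get(\"Accept\"))\n\t// Output:\n\t// goscrape\n\t// text/html\n}\n"
  },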
  {
    "path": "scraper/http_test.go",
    "content": "package scraper\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"net/http\"\n\t\"net/http/httptest\"\n\t\"net/url\"\n\t\"testing\"\n\t\"time\"\n\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n)\n\nfunc TestHeaders(t *testing.T) {\n\theaders := Headers([]string{\"a:b\", \"c:d:e\"})\n\tassert.Equal(t, \"b\", headers.Get(\"a\"))\n\tassert.Equal(t, \"d:e\", headers.Get(\"c\"))\n}\n\nfunc TestDownloadURLWithRetries(t *testing.T) {\n\tctx := context.Background()\n\texpected := \"ok\"\n\n\tvar retry int\n\tsvr := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {\n\t\tif retry < maxRetries {\n\t\t\tretry++\n\t\t\tw.WriteHeader(http.StatusTooManyRequests)\n\t\t\treturn\n\t\t}\n\t\t_, err := fmt.Fprint(w, expected)\n\t\tassert.NoError(t, err)\n\t}))\n\tdefer svr.Close()\n\n\tur, err := url.Parse(svr.URL)\n\trequire.NoError(t, err)\n\n\tmaxRetries = 2\n\tretryDelay = time.Millisecond\n\n\tvar cfg Config\n\tlogger := log.NewTestLogger(t)\n\ts, err := New(logger, cfg)\n\trequire.NoError(t, err)\n\n\t// download works after 2 retries\n\tb, urActual, err := s.downloadURLWithRetries(ctx, ur)\n\trequire.NoError(t, err)\n\trequire.NotNil(t, urActual)\n\tassert.Equal(t, svr.URL, urActual.String())\n\tassert.Equal(t, expected, string(b))\n\tassert.Equal(t, retry, maxRetries)\n\n\t// download fails after 3 retries\n\tretry = -100\n\t_, _, err = s.downloadURLWithRetries(ctx, ur)\n\tassert.ErrorIs(t, err, errExhaustedRetries)\n}\n"
  },
  {
    "path": "scraper/images.go",
    "content": "package scraper\n\nimport (\n\t\"bytes\"\n\t\"fmt\"\n\t\"image\"\n\t\"image/jpeg\"\n\t\"image/png\"\n\t\"net/url\"\n\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/h2non/filetype\"\n\t\"github.com/h2non/filetype/matchers\"\n\t\"github.com/h2non/filetype/types\"\n)\n\nfunc (s *Scraper) checkImageForRecode(url *url.URL, data []byte) []byte {\n\tif s.config.ImageQuality == 0 {\n\t\treturn data\n\t}\n\n\tkind, err := filetype.Match(data)\n\tif err != nil || kind == types.Unknown {\n\t\treturn data\n\t}\n\n\ts.logger.Debug(\"File type detected\",\n\t\tlog.String(\"type\", kind.MIME.Type),\n\t\tlog.String(\"sub_type\", kind.MIME.Subtype))\n\n\tif kind.MIME.Type == matchers.TypeJpeg.MIME.Type && kind.MIME.Subtype == matchers.TypeJpeg.MIME.Subtype {\n\t\tif recoded := s.recodeJPEG(url, data); recoded != nil {\n\t\t\treturn recoded\n\t\t}\n\t\treturn data\n\t}\n\n\tif kind.MIME.Type == matchers.TypePng.MIME.Type && kind.MIME.Subtype == matchers.TypePng.MIME.Subtype {\n\t\tif recoded := s.recodePNG(url, data); recoded != nil {\n\t\t\treturn recoded\n\t\t}\n\t\treturn data\n\t}\n\n\treturn data\n}\n\n// encodeJPEG encodes a new JPG based on the given quality setting.\nfunc (s *Scraper) encodeJPEG(img image.Image) []byte {\n\to := &jpeg.Options{\n\t\tQuality: int(s.config.ImageQuality),\n\t}\n\n\toutBuf := &bytes.Buffer{}\n\tif err := jpeg.Encode(outBuf, img, o); err != nil {\n\t\treturn nil\n\t}\n\treturn outBuf.Bytes()\n}\n\n// recodeJPEG recodes the image and returns it if it is smaller than before.\nfunc (s *Scraper) recodeJPEG(url fmt.Stringer, data []byte) []byte {\n\tinBuf := bytes.NewBuffer(data)\n\timg, err := jpeg.Decode(inBuf)\n\tif err != nil {\n\t\treturn nil\n\t}\n\n\tencoded := s.encodeJPEG(img)\n\tif encoded == nil || len(encoded) > len(data) { // only use the new file if it is smaller\n\t\treturn nil\n\t}\n\n\ts.logger.Debug(\"Recoded JPEG\",\n\t\tlog.String(\"url\", url.String()),\n\t\tlog.Int(\"size_original\", len(data)),\n\t\tlog.Int(\"size_recoded\", len(encoded)))\n\treturn encoded\n}\n\n// recodePNG recodes the image and returns it if it is smaller than before.\nfunc (s *Scraper) recodePNG(url fmt.Stringer, data []byte) []byte {\n\tinBuf := bytes.NewBuffer(data)\n\timg, err := png.Decode(inBuf)\n\tif err != nil {\n\t\treturn nil\n\t}\n\n\tencoded := s.encodeJPEG(img)\n\tif encoded == nil || len(encoded) > len(data) { // only use the new file if it is smaller\n\t\treturn nil\n\t}\n\n\ts.logger.Debug(\"Recoded PNG\",\n\t\tlog.String(\"url\", url.String()),\n\t\tlog.Int(\"size_original\", len(data)),\n\t\tlog.Int(\"size_recoded\", len(encoded)))\n\treturn encoded\n}\n"
  },
  {
    "path": "scraper/scraper.go",
    "content": "package scraper\n\nimport (\n\t\"bytes\"\n\t\"context\"\n\t\"encoding/base64\"\n\t\"errors\"\n\t\"fmt\"\n\t\"net/http\"\n\t\"net/http/cookiejar\"\n\t\"net/url\"\n\t\"regexp\"\n\t\"time\"\n\n\t\"github.com/cornelk/goscrape/htmlindex\"\n\t\"github.com/cornelk/gotokit/httpclient\"\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/cornelk/gotokit/set\"\n\t\"github.com/h2non/filetype\"\n\t\"github.com/h2non/filetype/types\"\n\t\"golang.org/x/net/html\"\n)\n\n// Config contains the scraper configuration.\ntype Config struct {\n\tURL      string\n\tIncludes []string\n\tExcludes []string\n\n\tImageQuality uint // image quality from 0 to 100%, 0 to disable reencoding\n\tMaxDepth     uint // download depth, 0 for unlimited\n\tTimeout      uint // time limit in seconds to process each http request\n\n\tOutputDirectory string\n\tUsername        string\n\tPassword        string\n\n\tCookies   []Cookie\n\tHeader    http.Header\n\tProxy     string\n\tUserAgent string\n}\n\ntype (\n\thttpDownloader     func(ctx context.Context, u *url.URL) ([]byte, *url.URL, error)\n\tdirCreator         func(path string) error\n\tfileExistenceCheck func(filePath string) bool\n\tfileWriter         func(filePath string, data []byte) error\n)\n\n// Scraper contains all scraping data.\ntype Scraper struct {\n\tconfig  Config\n\tcookies *cookiejar.Jar\n\tlogger  *log.Logger\n\tURL     *url.URL // contains the main URL to parse, will be modified in case of a redirect\n\n\tauth   string\n\tclient *http.Client\n\n\tincludes []*regexp.Regexp\n\texcludes []*regexp.Regexp\n\n\t// key is the URL of page or asset\n\tprocessed set.Set[string]\n\n\timagesQueue       []*url.URL\n\twebPageQueue      []*url.URL\n\twebPageQueueDepth map[string]uint\n\n\tdirCreator         dirCreator\n\tfileExistenceCheck fileExistenceCheck\n\tfileWriter         fileWriter\n\thttpDownloader     httpDownloader\n}\n\n// New creates a new Scraper instance.\n// nolint: funlen\nfunc New(logger *log.Logger, cfg Config) (*Scraper, error) {\n\tvar errs []error\n\n\tu, err := url.Parse(cfg.URL)\n\tif err != nil {\n\t\terrs = append(errs, err)\n\t}\n\tu.Fragment = \"\"\n\n\tincludes, err := compileRegexps(cfg.Includes)\n\tif err != nil {\n\t\terrs = append(errs, err)\n\t}\n\n\texcludes, err := compileRegexps(cfg.Excludes)\n\tif err != nil {\n\t\terrs = append(errs, err)\n\t}\n\n\tif errs != nil {\n\t\treturn nil, errors.Join(errs...)\n\t}\n\n\tif u.Scheme == \"\" {\n\t\tu.Scheme = \"http\" // if no URL scheme was given default to http\n\t}\n\n\tcookies, err := createCookieJar(u, cfg.Cookies)\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\n\t// Create HTTP transport with proxy configuration\n\ttransport, err := httpclient.ProxyTransportFromConfig(cfg.Proxy)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"creating proxy transport: %w\", err)\n\t}\n\n\tclient := &http.Client{\n\t\tJar:       cookies,\n\t\tTimeout:   time.Duration(cfg.Timeout) * time.Second,\n\t\tTransport: transport,\n\t}\n\n\ts := &Scraper{\n\t\tconfig:  cfg,\n\t\tcookies: cookies,\n\t\tlogger:  logger,\n\t\tURL:     u,\n\n\t\tclient: client,\n\n\t\tincludes: includes,\n\t\texcludes: excludes,\n\n\t\tprocessed: set.New[string](),\n\n\t\twebPageQueueDepth: map[string]uint{},\n\t}\n\n\ts.dirCreator = s.createDownloadPath\n\ts.fileExistenceCheck = s.fileExists\n\ts.fileWriter = s.writeFile\n\ts.httpDownloader = s.downloadURLWithRetries\n\n\tif s.config.Username != \"\" {\n\t\ts.auth = \"Basic \" + 
base64.StdEncoding.EncodeToString([]byte(s.config.Username+\":\"+s.config.Password))\n\t}\n\n\treturn s, nil\n}\n\n// Start starts the scraping.\nfunc (s *Scraper) Start(ctx context.Context) error {\n\tif err := s.dirCreator(s.config.OutputDirectory); err != nil {\n\t\treturn err\n\t}\n\n\tif !s.shouldURLBeDownloaded(s.URL, 0, false) {\n\t\treturn errors.New(\"start page is excluded from downloading\")\n\t}\n\n\tif err := s.processURL(ctx, s.URL, 0); err != nil {\n\t\treturn err\n\t}\n\n\tfor len(s.webPageQueue) > 0 {\n\t\tur := s.webPageQueue[0]\n\t\ts.webPageQueue = s.webPageQueue[1:]\n\t\tcurrentDepth := s.webPageQueueDepth[ur.String()]\n\t\tif err := s.processURL(ctx, ur, currentDepth+1); err != nil && errors.Is(err, context.Canceled) {\n\t\t\treturn err\n\t\t}\n\t}\n\n\treturn nil\n}\n\nfunc (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint) error {\n\ts.logger.Info(\"Downloading webpage\", log.String(\"url\", u.String()))\n\tdata, respURL, err := s.httpDownloader(ctx, u)\n\tif err != nil {\n\t\ts.logger.Error(\"Processing HTTP Request failed\",\n\t\t\tlog.String(\"url\", u.String()),\n\t\t\tlog.Err(err))\n\t\treturn err\n\t}\n\n\tfileExtension := \"\"\n\tkind, err := filetype.Match(data)\n\tif err == nil && kind != types.Unknown {\n\t\tfileExtension = kind.Extension\n\t}\n\n\tif currentDepth == 0 {\n\t\tu = respURL\n\t\t// use the URL that the website returned as new base url for the\n\t\t// scrape, in case of a redirect it changed\n\t\ts.URL = u\n\t}\n\n\tbuf := bytes.NewBuffer(data)\n\tdoc, err := html.Parse(buf)\n\tif err != nil {\n\t\ts.logger.Error(\"Parsing HTML failed\",\n\t\t\tlog.String(\"url\", u.String()),\n\t\t\tlog.Err(err))\n\t\treturn fmt.Errorf(\"parsing HTML: %w\", err)\n\t}\n\n\tindex := htmlindex.New(s.logger)\n\tindex.Index(u, doc)\n\n\ts.storeDownload(u, data, doc, index, fileExtension)\n\n\tif err := s.downloadReferences(ctx, index); err != nil {\n\t\treturn err\n\t}\n\n\t// check first and download afterward to not hit max depth limit for\n\t// start page links because of recursive linking\n\t// a hrefs\n\treferences, err := index.URLs(htmlindex.ATag)\n\tif err != nil {\n\t\ts.logger.Error(\"Parsing URL failed\", log.Err(err))\n\t}\n\n\tfor _, ur := range references {\n\t\tur.Fragment = \"\"\n\n\t\tif s.shouldURLBeDownloaded(ur, currentDepth, false) {\n\t\t\ts.webPageQueue = append(s.webPageQueue, ur)\n\t\t\ts.webPageQueueDepth[ur.String()] = currentDepth\n\t\t}\n\t}\n\n\treturn nil\n}\n\n// storeDownload writes the download to a file, if a known binary file is detected,\n// processing of the file as page to look for links is skipped.\nfunc (s *Scraper) storeDownload(u *url.URL, data []byte, doc *html.Node,\n\tindex *htmlindex.Index, fileExtension string) {\n\n\t// We need to distinguish between HTML pages and binary files (images, PDFs, etc.)\n\t// because they need different file path handling:\n\t// - HTML pages: add .html extension, handle directory indexes like /about -> /about.html\n\t// - Binary files: keep original path, so /photo.jpg stays /photo.jpg, not /photo.jpg.html\n\t// This prevents breaking binary downloads that were working before.\n\tisAPage := false\n\tif fileExtension == \"\" {\n\t\tfixed, hasChanges, err := s.fixURLReferences(u, doc, index)\n\t\tif err != nil {\n\t\t\ts.logger.Error(\"Fixing file references failed\",\n\t\t\t\tlog.String(\"url\", u.String()),\n\t\t\t\tlog.Err(err))\n\t\t\treturn\n\t\t}\n\n\t\tif hasChanges {\n\t\t\tdata = fixed\n\t\t}\n\t\t// Only HTML content gets processed as a \"page\" - binary 
files stay as-is\n\t\tisAPage = true\n\t}\n\n\tfilePath := s.getFilePath(u, isAPage)\n\t// always update html files, content might have changed\n\tif err := s.fileWriter(filePath, data); err != nil {\n\t\ts.logger.Error(\"Writing to file failed\",\n\t\t\tlog.String(\"URL\", u.String()),\n\t\t\tlog.String(\"file\", filePath),\n\t\t\tlog.Err(err))\n\t}\n}\n\n// compileRegexps compiles the given regex strings to regular expressions\n// to be used in the include and exclude filters.\nfunc compileRegexps(regexps []string) ([]*regexp.Regexp, error) {\n\tvar errs []error\n\tvar compiled []*regexp.Regexp\n\n\tfor _, exp := range regexps {\n\t\tre, err := regexp.Compile(exp)\n\t\tif err == nil {\n\t\t\tcompiled = append(compiled, re)\n\t\t} else {\n\t\t\terrs = append(errs, err)\n\t\t}\n\t}\n\n\tif len(errs) > 0 {\n\t\treturn nil, errors.Join(errs...)\n\t}\n\treturn compiled, nil\n}\n"
  },
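  {
    "path": "scraper/scraper_example_test.go",
    "content": "package scraper\n\n// Hypothetical example file, not part of the original repository: a minimal\n// sketch illustrating how compileRegexps compiles include/exclude patterns\n// and joins all compile errors into one.\n\nimport \"fmt\"\n\nfunc Example_compileRegexps() {\n\tcompiled, err := compileRegexps([]string{`\\.jpg$`, `^/private/`})\n\tfmt.Println(len(compiled), err)\n\n\t_, err = compileRegexps([]string{`(`})\n\tfmt.Println(err != nil)\n\t// Output:\n\t// 2 <nil>\n\t// true\n}\n"
  },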
  {
    "path": "scraper/scraper_test.go",
    "content": "package scraper\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"net/url\"\n\t\"testing\"\n\n\t\"github.com/cornelk/gotokit/log\"\n\t\"github.com/cornelk/gotokit/set\"\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n)\n\nfunc newTestScraper(t *testing.T, startURL string, urls map[string][]byte) *Scraper {\n\tt.Helper()\n\n\tlogger := log.NewTestLogger(t)\n\tcfg := Config{\n\t\tURL: startURL,\n\t}\n\tscraper, err := New(logger, cfg)\n\trequire.NoError(t, err)\n\trequire.NotNil(t, scraper)\n\n\tscraper.dirCreator = func(_ string) error {\n\t\treturn nil\n\t}\n\tscraper.fileWriter = func(_ string, _ []byte) error {\n\t\treturn nil\n\t}\n\tscraper.fileExistenceCheck = func(_ string) bool {\n\t\treturn false\n\t}\n\tscraper.httpDownloader = func(_ context.Context, url *url.URL) ([]byte, *url.URL, error) {\n\t\tur := url.String()\n\t\tb, ok := urls[ur]\n\t\tif ok {\n\t\t\treturn b, url, nil\n\t\t}\n\t\treturn nil, nil, fmt.Errorf(\"url '%s' not found in test data\", ur)\n\t}\n\n\treturn scraper\n}\n\nfunc TestScraperLinks(t *testing.T) {\n\tindexPage := []byte(`\n<html>\n<head>\n<link href=' https://example.org/style.css#fragment' rel='stylesheet' type='text/css'>\n</head>\n<body>\n<a href=\"https://example.org/page2\">Example</a>\n</body>\n</html>\n`)\n\n\tpage2 := []byte(`\n<html>\n<body>\n\n<!--link to index with fragment-->\n<a href=\"/#fragment\">a</a>\n<!--link to page with fragment-->\n<a href=\"/sub/#fragment\">a</a>\n\n</body>\n</html>\n`)\n\n\tcss := []byte(``)\n\n\tstartURL := \"https://example.org/#fragment\" // start page with fragment\n\turls := map[string][]byte{\n\t\t\"https://example.org/\":          indexPage,\n\t\t\"https://example.org/page2\":     page2,\n\t\t\"https://example.org/sub/\":      indexPage,\n\t\t\"https://example.org/style.css\": css,\n\t}\n\n\tscraper := newTestScraper(t, startURL, urls)\n\trequire.NotNil(t, scraper)\n\n\tctx := context.Background()\n\terr := scraper.Start(ctx)\n\trequire.NoError(t, err)\n\n\texpectedProcessed := set.NewFromSlice([]string{\n\t\t\"/\",\n\t\t\"/page2\",\n\t\t\"/sub\",\n\t\t\"/style.css\",\n\t})\n\tassert.Equal(t, expectedProcessed, scraper.processed)\n}\n\nfunc TestScraperAttributes(t *testing.T) {\n\tindexPage := []byte(`\n<html>\n<head>\n</head>\n\n<body background=\"bg.gif\">\n\n<!--embedded image-->\n<img src='data:image/gif;base64,R0lGODlhAQABAAD/ACwAAAAAAQABAAACADs%3D=' />\n\n</body>\n</html>\n`)\n\tempty := []byte(``)\n\n\tstartURL := \"https://example.org/\"\n\turls := map[string][]byte{\n\t\t\"https://example.org/\":       indexPage,\n\t\t\"https://example.org/bg.gif\": empty,\n\t}\n\n\tscraper := newTestScraper(t, startURL, urls)\n\trequire.NotNil(t, scraper)\n\n\tctx := context.Background()\n\terr := scraper.Start(ctx)\n\trequire.NoError(t, err)\n\n\texpectedProcessed := set.NewFromSlice([]string{\n\t\t\"/\",\n\t\t\"/bg.gif\",\n\t})\n\tassert.Equal(t, expectedProcessed, scraper.processed)\n}\n\nfunc TestScraperInternalCss(t *testing.T) {\n\tindexPage := []byte(`\n<html>\n<head>\n<style>\nh1 {\n  background-image: url('https://example.org/background.jpg');\n}\nh2 {\n  background-image: url('/img/bg.jpg');\n}\nh3 {\n  background-image: url(bg3.jpg);\n}\n</style>\n</head>\n<body>\n</body>\n</html>\n`)\n\tempty := []byte(``)\n\n\tdomain := \"example.org\"\n\tfile1Reference := \"background.jpg\"\n\tfile2Reference := \"img/bg.jpg\"\n\tfile3Reference := \"bg3.jpg\"\n\tfullURL := \"https://\" + domain\n\n\turls := map[string][]byte{\n\t\tfullURL + \"/\":                  
indexPage,\n\t\tfullURL + \"/\" + file1Reference: empty,\n\t\tfullURL + \"/\" + file2Reference: empty,\n\t\tfullURL + \"/\" + file3Reference: empty,\n\t}\n\n\tscraper := newTestScraper(t, fullURL+\"/\", urls)\n\trequire.NotNil(t, scraper)\n\n\tfiles := map[string][]byte{}\n\tscraper.fileWriter = func(filePath string, data []byte) error {\n\t\tfiles[filePath] = data\n\t\treturn nil\n\t}\n\n\tctx := context.Background()\n\terr := scraper.Start(ctx)\n\trequire.NoError(t, err)\n\n\texpectedProcessed := set.NewFromSlice([]string{\n\t\t\"/\",\n\t\t\"/\" + file1Reference,\n\t\t\"/\" + file2Reference,\n\t\t\"/\" + file3Reference,\n\t})\n\trequire.Equal(t, expectedProcessed, scraper.processed)\n\n\tref := domain + \"/index.html\"\n\tcontent := string(files[ref])\n\tassert.Contains(t, content, \"url('\"+file1Reference+\"')\")\n\tassert.Contains(t, content, \"url('\"+file2Reference+\"')\")\n\tassert.Contains(t, content, \"url(\"+file3Reference+\")\")\n}\n"
  },
  {
    "path": "scraper/server.go",
    "content": "package scraper\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"mime\"\n\t\"net/http\"\n\n\t\"github.com/cornelk/gotokit/log\"\n)\n\n// set more mime types in the browser, this for example fixes .asp files not being\n// downloaded but handled as html.\nvar mimeTypes = map[string]string{\n\t\".asp\": \"text/html; charset=utf-8\",\n}\n\n// ServeDirectory serves a directory on a given port as a web server.\nfunc ServeDirectory(ctx context.Context, path string, port int16, logger *log.Logger) error {\n\tfs := http.FileServer(http.Dir(path))\n\tmux := http.NewServeMux()\n\tmux.Handle(\"/\", fs) // server root by file system\n\n\t// update mime types\n\tfor ext, mt := range mimeTypes {\n\t\tif err := mime.AddExtensionType(ext, mt); err != nil {\n\t\t\treturn fmt.Errorf(\"adding mime type '%s': %w\", ext, err)\n\t\t}\n\t}\n\n\tfullAddr := fmt.Sprintf(\"http://127.0.0.1:%d\", port)\n\tlogger.Info(\"Serving directory...\",\n\t\tlog.String(\"path\", path),\n\t\tlog.String(\"address\", fullAddr))\n\n\tserver := &http.Server{\n\t\tAddr:    fmt.Sprintf(\":%d\", port),\n\t\tHandler: mux,\n\t}\n\n\tserverErr := make(chan error, 1)\n\tgo func() {\n\t\tserverErr <- server.ListenAndServe()\n\t}()\n\n\tselect {\n\tcase <-ctx.Done():\n\t\t//nolint: contextcheck\n\t\tif err := server.Shutdown(context.Background()); err != nil {\n\t\t\treturn fmt.Errorf(\"shutting down webserver: %w\", err)\n\t\t}\n\t\treturn nil\n\n\tcase err := <-serverErr:\n\t\treturn fmt.Errorf(\"starting webserver: %w\", err)\n\t}\n}\n"
  },
  {
    "path": "scraper/url.go",
    "content": "package scraper\n\nimport (\n\t\"net/url\"\n\t\"path\"\n\t\"path/filepath\"\n\t\"strings\"\n)\n\nfunc resolveURL(base *url.URL, reference, mainPageHost string, isHyperlink bool, relativeToRoot string) string {\n\tur, err := url.Parse(reference)\n\tif err != nil {\n\t\treturn \"\"\n\t}\n\n\tvar resolvedURL *url.URL\n\tif ur.Host != \"\" && ur.Host != mainPageHost {\n\t\tif isHyperlink { // do not change links to external websites\n\t\t\treturn reference\n\t\t}\n\n\t\tresolvedURL = base.ResolveReference(ur)\n\t\tresolvedURL.Path = filepath.Join(\"_\"+ur.Host, resolvedURL.Path)\n\t} else {\n\t\tif isHyperlink {\n\t\t\tur.Path = getPageFilePath(ur)\n\t\t\tresolvedURL = base.ResolveReference(ur)\n\t\t} else {\n\t\t\tresolvedURL = base.ResolveReference(ur)\n\t\t}\n\t}\n\n\tif resolvedURL.Host == mainPageHost {\n\t\tresolvedURL.Path = urlRelativeToOther(resolvedURL, base)\n\t\trelativeToRoot = \"\"\n\t}\n\n\tresolvedURL.Host = \"\"   // remove host\n\tresolvedURL.Scheme = \"\" // remove http/https\n\tresolved := resolvedURL.String()\n\n\tif resolved == \"\" {\n\t\tresolved = \"/\" // website root\n\t} else {\n\t\tif resolved[0] == '/' && len(relativeToRoot) > 0 {\n\t\t\tresolved = relativeToRoot + resolved[1:]\n\t\t} else {\n\t\t\tresolved = relativeToRoot + resolved\n\t\t}\n\t}\n\n\tif isHyperlink {\n\t\tif resolved[len(resolved)-1] == '/' {\n\t\t\tresolved += PageDirIndex // link dir index to index.html\n\t\t} else {\n\t\t\tl := strings.LastIndexByte(resolved, '/')\n\t\t\tif l != -1 && l < len(resolved) && resolved[l+1] == '#' {\n\t\t\t\tresolved = resolved[:l+1] + PageDirIndex + resolved[l+1:] // link fragment correct\n\t\t\t}\n\t\t}\n\t}\n\n\tresolved = strings.TrimPrefix(resolved, \"/\")\n\treturn resolved\n}\n\nfunc urlRelativeToRoot(url *url.URL) string {\n\tvar rel strings.Builder\n\tsplits := strings.Split(url.Path, \"/\")\n\n\tfor i := range splits {\n\t\tif (len(splits[i]) > 0) && (i < len(splits)-1) {\n\t\t\trel.WriteString(\"../\")\n\t\t}\n\t}\n\n\treturn rel.String()\n}\n\nfunc urlRelativeToOther(src, base *url.URL) string {\n\tsrcSplits := strings.Split(src.Path, \"/\")\n\tbaseSplits := strings.Split(getPageFilePath(base), \"/\")\n\n\tfor len(srcSplits) > 0 && len(baseSplits) > 0 {\n\t\tif len(srcSplits[0]) == 0 {\n\t\t\tsrcSplits = srcSplits[1:]\n\t\t\tcontinue\n\t\t}\n\t\tif len(baseSplits[0]) == 0 {\n\t\t\tbaseSplits = baseSplits[1:]\n\t\t\tcontinue\n\t\t}\n\n\t\tif srcSplits[0] == baseSplits[0] {\n\t\t\tsrcSplits = srcSplits[1:]\n\t\t\tbaseSplits = baseSplits[1:]\n\t\t} else {\n\t\t\tbreak\n\t\t}\n\t}\n\n\tvar upLevels strings.Builder\n\n\tfor i, split := range baseSplits {\n\t\tif split == \"\" {\n\t\t\tcontinue\n\t\t}\n\t\t// Page filename is not a level.\n\t\tif i == len(baseSplits)-1 {\n\t\t\tbreak\n\t\t}\n\t\tupLevels.WriteString(\"../\")\n\t}\n\n\treturn upLevels.String() + path.Join(srcSplits...)\n}\n"
  },
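  {
    "path": "scraper/url_example_test.go",
    "content": "package scraper\n\n// Hypothetical example file, not part of the original repository: a minimal\n// sketch illustrating resolveURL, mirroring two fixtures from url_test.go.\n\nimport (\n\t\"fmt\"\n\t\"net/url\"\n)\n\nfunc Example_resolveURL() {\n\tbase := &url.URL{Scheme: \"https\", Host: \"petpic.xyz\", Path: \"/earth/\"}\n\n\t// a hyperlink within the main host gets the page naming conventions applied\n\tfmt.Println(resolveURL(base, \"brasil/index.html\", \"petpic.xyz\", true, \"\"))\n\t// an asset reference stays relative to the current page\n\tfmt.Println(resolveURL(base, \"../argentina/cat.jpg\", \"petpic.xyz\", false, \"\"))\n\t// Output:\n\t// brasil/index.html\n\t// ../argentina/cat.jpg\n}\n"
  },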
  {
    "path": "scraper/url_test.go",
    "content": "package scraper\n\nimport (\n\t\"net/url\"\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestResolveURL(t *testing.T) {\n\ttype filePathFixture struct {\n\t\tBaseURL        url.URL\n\t\tReference      string\n\t\tIsHyperlink    bool\n\t\tRelativeToRoot string\n\t\tResolved       string\n\t}\n\n\tpathlessURL := url.URL{\n\t\tScheme: \"https\",\n\t\tHost:   \"petpic.xyz\",\n\t\tPath:   \"\",\n\t}\n\n\tURL := url.URL{\n\t\tScheme: \"https\",\n\t\tHost:   \"petpic.xyz\",\n\t\tPath:   \"/earth/\",\n\t}\n\n\tvar fixtures = []filePathFixture{\n\t\t{pathlessURL, \"\", true, \"\", \"index.html\"},\n\t\t{pathlessURL, \"#contents\", true, \"\", \"#contents\"},\n\t\t{URL, \"brasil/index.html\", true, \"\", \"brasil/index.html\"},\n\t\t{URL, \"brasil/rio/index.html\", true, \"\", \"brasil/rio/index.html\"},\n\t\t{URL, \"../argentina/cat.jpg\", false, \"\", \"../argentina/cat.jpg\"},\n\t}\n\n\tfor _, fix := range fixtures {\n\t\tresolved := resolveURL(&fix.BaseURL, fix.Reference, URL.Host, fix.IsHyperlink, fix.RelativeToRoot)\n\t\tassert.Equal(t, fix.Resolved, resolved)\n\t}\n}\n\nfunc Test_urlRelativeToOther(t *testing.T) {\n\ttype filePathFixture struct {\n\t\tSrcURL          url.URL\n\t\tBaseURL         url.URL\n\t\tExpectedSrcPath string\n\t}\n\n\tvar fixtures = []filePathFixture{\n\t\t{url.URL{Path: \"/earth/brasil/rio/cat.jpg\"}, url.URL{Path: \"/earth/brasil/rio/\"}, \"cat.jpg\"},\n\t\t{url.URL{Path: \"/earth/brasil/rio/cat.jpg\"}, url.URL{Path: \"/earth/\"}, \"brasil/rio/cat.jpg\"},\n\t\t{url.URL{Path: \"/earth/cat.jpg\"}, url.URL{Path: \"/earth/brasil/rio/\"}, \"../../cat.jpg\"},\n\t\t{url.URL{Path: \"/earth/argentina/cat.jpg\"}, url.URL{Path: \"/earth/brasil/rio/\"}, \"../../argentina/cat.jpg\"},\n\t\t{url.URL{Path: \"/earth/brasil/rio/cat.jpg\"}, url.URL{Path: \"/mars/dogtown/\"}, \"../../earth/brasil/rio/cat.jpg\"},\n\t\t{url.URL{Path: \"///earth//////cat.jpg\"}, url.URL{Path: \"///earth/brasil//rio////////\"}, \"../../cat.jpg\"},\n\t}\n\n\tfor _, fix := range fixtures {\n\t\trelativeURL := urlRelativeToOther(&fix.SrcURL, &fix.BaseURL)\n\t\tassert.Equal(t, fix.ExpectedSrcPath, relativeURL)\n\t}\n}\n\nfunc Test_urlRelativeToRoot(t *testing.T) {\n\ttype urlFixture struct {\n\t\tSrcURL   url.URL\n\t\tExpected string\n\t}\n\n\tvar fixtures = []urlFixture{\n\t\t{url.URL{Path: \"/earth/brasil/rio/cat.jpg\"}, \"../../../\"},\n\t\t{url.URL{Path: \"cat.jpg\"}, \"\"},\n\t\t{url.URL{Path: \"/earth/argentina\"}, \"../\"},\n\t\t{url.URL{Path: \"///earth//////cat.jpg\"}, \"../\"},\n\t}\n\n\tfor _, fix := range fixtures {\n\t\trelativeURL := urlRelativeToRoot(&fix.SrcURL)\n\t\tassert.Equal(t, fix.Expected, relativeURL)\n\t}\n}\n"
  }
]