Repository: cornelk/goscrape Branch: main Commit: ee772e8edd93 Files: 39 Total size: 86.7 KB Directory structure: gitextract_eo_umi2a/ ├── .codecov.yml ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ └── feature_request.md │ └── workflows/ │ ├── codecov.yaml │ ├── go.yaml │ └── release.yml ├── .gitignore ├── .golangci.yml ├── .goreleaser.yaml ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── css/ │ └── css.go ├── go.mod ├── go.sum ├── htmlindex/ │ ├── attributes.go │ ├── htmlindex.go │ └── htmlindex_test.go ├── main.go └── scraper/ ├── checks.go ├── checks_test.go ├── cookies.go ├── download.go ├── download_test.go ├── fileutil.go ├── fileutil_test.go ├── fs.go ├── html.go ├── html_test.go ├── http.go ├── http_test.go ├── images.go ├── scraper.go ├── scraper_test.go ├── server.go ├── url.go └── url_test.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .codecov.yml ================================================ coverage: status: patch: off project: default: target: 35% threshold: 5% ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Create a report to help us improve title: '' labels: bug assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: **Expected behavior** A clear and concise description of what you expected to happen. **Screenshots** If applicable, add screenshots to help explain your problem. ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea for this project title: '' labels: enhancement assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. 
================================================ FILE: .github/workflows/codecov.yaml ================================================ name: codecov on: push: branches: - main pull_request: jobs: codecov: timeout-minutes: 15 name: Coverage runs-on: ubuntu-latest steps: - name: Set up Go 1.x uses: actions/setup-go@v5 with: go-version: "1.24" id: go - name: Check out code into the Go module directory uses: actions/checkout@v4 - name: Get dependencies run: go mod download - name: Run tests with coverage run: make test-coverage - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} file: ./.testCoverage ================================================ FILE: .github/workflows/go.yaml ================================================ name: CI on: push: branches: - main pull_request: types: - opened - reopened - synchronize - ready_for_review jobs: build: if: ${{ github.event_name == 'push' || !github.event.pull_request.draft || !contains(github.event.commits[0].message, '[skip ci]') }} timeout-minutes: 15 name: Build runs-on: ubuntu-latest strategy: matrix: go: [ "1.24" ] steps: - name: Set up Go 1.x uses: actions/setup-go@v5 with: go-version: ${{ matrix.go }} id: go - name: Check out code into the Go module directory uses: actions/checkout@v4 - name: Install linters run: make install-linters - name: Get dependencies run: go mod download - name: Run tests run: make test - name: Run linter run: make lint ================================================ FILE: .github/workflows/release.yml ================================================ name: release on: push: tags: - 'v*' jobs: goreleaser: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Go uses: actions/setup-go@v5 with: go-version: '1.24' check-latest: true cache: true - name: Docker login uses: docker/login-action@v3 with: registry: 'ghcr.io' username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Run GoReleaser uses: goreleaser/goreleaser-action@v6 with: version: latest args: release --clean env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .gitignore ================================================ *.db *.dmp *.exe *.iml *.local *.out *.prof *.test .DS_Store .fleet .idea .vscode /*.log .testCoverage dist/ goscrape ================================================ FILE: .golangci.yml ================================================ version: "2" run: modules-download-mode: readonly linters: enable: - asasalint # Check for pass []any as any in variadic func(...any) - asciicheck # Checks that code identifiers do not have non-ASCII symbols - bidichk # Checks for dangerous unicode character sequences - bodyclose # Checks whether HTTP response body is closed successfully - containedctx # Detects struct contained context.Context field - contextcheck # Check whether the function uses a non-inherited context - copyloopvar # Detects places where loop variables are copied - cyclop # Checks function and package cyclomatic complexity - decorder # Check declaration order of types, constants, variables and functions - dogsled # Checks assignments with too many blank identifiers - durationcheck # Check for two durations multiplied together - embeddedstructfieldcheck # Checks embedded struct field positioning and formatting - err113 # Check errors handling expressions - errcheck # Checks for unchecked errors in Go code - errname # Checks that sentinel errors are prefixed with 
Err - errorlint # Find code that will cause problems with error wrapping - funlen # Checks for long functions - gocheckcompilerdirectives # Checks that go compiler directives are valid - gocognit # Computes and checks the cognitive complexity of functions - goconst # Finds repeated strings that could be replaced by a constant - gocritic # Provides diagnostics that check for bugs, performance and style issues - gocyclo # Computes and checks the cyclomatic complexity of functions - godot # Check if comments end in a period - govet # Examines Go source code and reports suspicious constructs - grouper # Analyze expression groups - iface # Detect the incorrect use of interfaces - ineffassign # Detects when assignments to existing variables are not used - intrange # Find places where for loops could use an integer range - ireturn # Accept Interfaces, Return Concrete Types - maintidx # Measures the maintainability index of each function - makezero # Find slice declarations with non-zero initial length - mirror # Reports wrong mirror patterns of bytes/strings usage - misspell # Finds commonly misspelled English words - modernize # Suggest simplifications using modern Go features - nakedret # Finds naked returns in functions longer than a maximum size - nestif # Reports deeply nested if statements - nilerr # Finds code that returns nil even if it checks that error is not nil - nilnil # Checks that there is no simultaneous return of nil error and invalid value - noctx # Detects missing usage of context.Context - nosprintfhostport # Checks for misuse of Sprintf to construct a host with port - perfsprint # Checks that fmt.Sprintf can be replaced with a faster alternative - prealloc # Find slice declarations that could be pre-allocated - predeclared # Find code that shadows one of Go's predeclared identifiers - reassign # Checks that package variables are not reassigned - revive # Fast, configurable, extensible linter for Go - staticcheck # Set of rules from staticcheck - testifylint # Checks usage of github.com/stretchr/testify - thelper # Detects test helpers which do not call t.Helper() - tparallel # Detects inappropriate usage of t.Parallel() - unconvert # Remove unnecessary type conversions - unparam # Reports unused function parameters - unused # Checks Go code for unused constants, variables, functions and types - usestdlibvars # Detect the possibility to use variables/constants from stdlib - wastedassign # Finds wasted assignment statements - whitespace # Checks for unnecessary newlines - wrapcheck # Checks that errors returned from external packages are wrapped settings: cyclop: max-complexity: 15 whitespace: multi-if: true # Enforces newlines (or comments) after every multi-line if statement multi-func: true # Enforces newlines (or comments) after every multi-line function signature exclusions: rules: - linters: - err113 text: do not define dynamic errors formatters: enable: - gci # Check if code and import statements are formatted, with additional rules - gofmt # Check if the code is formatted according to 'gofmt' command - goimports # Checks if the code and import statements are formatted according to the 'goimports' command ================================================ FILE: .goreleaser.yaml ================================================ version: 2 before: hooks: - go mod download builds: - id: goscrape binary: goscrape dir: . 
env: - CGO_ENABLED=0 targets: - go_first_class flags: - -trimpath ldflags: - -s -w -X main.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{ .CommitDate }} dockers: - id: goscrape image_templates: - "ghcr.io/cornelk/goscrape:{{ .Version }}" universal_binaries: - replace: false archives: - id: goscrape builds: ['goscrape'] name_template: >- {{ .ProjectName }}_v{{ .Version }}_ {{- if eq .Os "darwin" }}macos {{- else }}{{ .Os }}{{ end }}_ {{- if eq .Arch "amd64" }}64bit {{- else if eq .Arch "386" }}32bit {{- else if eq .Arch "Darwin" }}macos {{- else }}{{ .Arch }}{{ end }} format_overrides: - goos: windows format: zip checksum: name_template: 'checksums.txt' snapshot: name_template: "{{ .Tag }}-snapshot" changelog: disable: true ================================================ FILE: CHANGELOG.md ================================================ # Changelog for goscrape All notable changes to this project will be documented in this file. ## [v0.3.0] - 2025-01-01 Added: * body background attribute support * fragments at the end of the URL are now filtered * support srcset attribute for image downloading Fixed: * inline css gets processed correctly * --include / --exclude parameter list handling ## [v0.2.0] - 2024-06-21 Added: * add cookie loading and storing support * add docker image build * add header parameter support * add user agent parameter support * support serving directory in local webserver Changed: * improved error messages * reduced dependencies and binary size Fixed: * fix asset downloading * fix downloading of binary files * fix logger output * fix logger verbose flag usage * fix stopping of program using ctrl+c ## [v0.1.1] - 2023-02-11 Added: * binaries for releases * http request timeout configuration support * proxy configuration support Fixed: * do not abort scraping on error ## [v0.1.0] - 2022-06-26 First version of goscrape released. ================================================ FILE: Dockerfile ================================================ FROM gcr.io/distroless/static-debian12 COPY goscrape / ENTRYPOINT ["./goscrape"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2017 Cornel Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: Makefile ================================================ GOLANGCI_VERSION = v2.6.0 help: ## show help, shown by default if no target is specified @grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' lint: ## run code linters golangci-lint run test: ## run tests go test -timeout 10s -race ./... test-coverage: ## run unit tests and create test coverage go test -timeout 10s ./... -coverprofile .testCoverage -covermode=atomic -coverpkg=./... test-coverage-web: test-coverage ## run unit tests and show test coverage in browser go tool cover -func .testCoverage | grep total | awk '{print "Total coverage: "$$3}' go tool cover -html=.testCoverage install: ## install all binaries go install -buildvcs=false . install-linters: ## install all linters go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@${GOLANGCI_VERSION} release-snapshot: ## build release binaries from current git state as snapshot goreleaser release --snapshot --clean ================================================ FILE: README.md ================================================ # goscrape - create offline browsable copies of websites [![Build status](https://github.com/cornelk/goscrape/actions/workflows/go.yaml/badge.svg?branch=main)](https://github.com/cornelk/goscrape/actions) [![go.dev reference](https://img.shields.io/badge/go.dev-reference-007d9c?logo=go&logoColor=white&style=flat-square)](https://pkg.go.dev/github.com/cornelk/goscrape) [![Go Report Card](https://goreportcard.com/badge/github.com/cornelk/goscrape)](https://goreportcard.com/report/github.com/cornelk/goscrape) [![codecov](https://codecov.io/gh/cornelk/goscrape/branch/main/graph/badge.svg?token=NS5UY28V3A)](https://codecov.io/gh/cornelk/goscrape) A web scraper built with Golang. It downloads the content of a website and allows it to be archived and read offline. ## Features Features and advantages over existing tools like wget, httrack, Teleport Pro: * Free and open source * Available for all platforms that Golang supports * JPEG and PNG images can be converted down in quality to save disk space * Excluded URLS will not be fetched (unlike [wget](https://savannah.gnu.org/bugs/?20808)) * No incomplete temp files are left on disk * Downloaded asset files are skipped in a new scraper run * Assets from external domains are downloaded automatically * Sane default values ## Limitations * No GUI version, console only ## Installation There are 2 options to install goscrape: 1. Download and unpack a binary release from [Releases](https://github.com/cornelk/goscrape/releases) or 2. Compile the latest release from source: ``` go install github.com/cornelk/goscrape@latest ``` Compiling the tool from source code needs to have a recent version of [Golang](https://go.dev/) installed. 
## Usage

Scrape a website by running

```
goscrape http://website.com
```

To serve the downloaded website directory with a local webserver, use

```
goscrape --serve website.com
```

## Options

```
--include INCLUDE, -n INCLUDE    only include URLs with PERL Regular Expressions support
--exclude EXCLUDE, -x EXCLUDE    exclude URLs with PERL Regular Expressions support
--output OUTPUT, -o OUTPUT       output directory to write files to
--depth DEPTH, -d DEPTH          download depth, 0 for unlimited [default: 10]
--imagequality IMAGEQUALITY, -i IMAGEQUALITY    image quality, 0 to disable reencoding
--timeout TIMEOUT, -t TIMEOUT    time limit in seconds for each HTTP request to connect and read the request body
--serve SERVE, -s SERVE          serve the website using a webserver
--serverport SERVERPORT, -r SERVERPORT    port to use for the webserver [default: 8080]
--cookiefile COOKIEFILE, -c COOKIEFILE    file containing the cookie content
--savecookiefile SAVECOOKIEFILE  file to save the cookie content
--header HEADER, -h HEADER       HTTP header to use for scraping
--proxy PROXY, -p PROXY          proxy to use in format scheme://[user:password@]host:port (supports HTTP, HTTPS, SOCKS5 protocols)
--user USER, -u USER             user[:password] to use for HTTP authentication
--useragent USERAGENT, -a USERAGENT    user agent to use for scraping
--verbose, -v                    verbose output
--help, -h                       display this help and exit
--version                        display version and exit
```

## Cookies

Cookies can be passed in a file using the `--cookiefile` parameter; the file has to contain cookies in the following format:

```
[{"name":"user","value":"123"},{"name":"session","value":"sid"}]
```

## Proxy Configuration

The `--proxy` flag supports multiple proxy protocols for scraping through different types of proxy servers:

### Supported Proxy Types

- **HTTP proxies**: `http://proxy.example.com:8080`
- **HTTPS proxies**: `https://proxy.example.com:8080`
- **SOCKS5 proxies**: `socks5://proxy.example.com:1080`

### Authentication

Proxies requiring authentication can include credentials in the URL:

```bash
# HTTP proxy with authentication
goscrape --proxy http://username:password@proxy.example.com:8080 http://website.com

# SOCKS5 proxy with authentication
goscrape --proxy socks5://user:pass@proxy.example.com:1080 http://website.com
```

================================================
FILE: css/css.go
================================================
// Package css provides a CSS parser that can process CSS data and call a processor for every found URL.
package css

import (
	"net/url"
	"regexp"
	"strings"

	"github.com/cornelk/gotokit/log"
	"github.com/gorilla/css/scanner"
)

var cssURLRe = regexp.MustCompile(`^url\(['"]?(.*?)['"]?\)$`)

// Token represents a token and the corresponding string.
type Token = scanner.Token

type urlProcessor func(token *Token, data string, url *url.URL)

// Process the CSS data and call a processor for every found URL.
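Before the implementation of `Process` that follows, a minimal usage sketch may help; the sample stylesheet, URLs and test name are illustrative and not part of the repository. Note that inside `Process` the `url` parameter shadows the `net/url` package, so each reference found in the CSS is handed to the processor already resolved against that base URL:

```go
package css_test

import (
	"net/url"
	"testing"

	"github.com/cornelk/goscrape/css"
	"github.com/cornelk/gotokit/log"
	"github.com/stretchr/testify/require"
)

func TestProcessSketch(t *testing.T) {
	base, err := url.Parse("https://example.com/styles/site.css")
	require.NoError(t, err)

	var raw, resolved []string
	processor := func(_ *css.Token, src string, u *url.URL) {
		raw = append(raw, src)                   // value as written inside url(...)
		resolved = append(resolved, u.String()) // resolved against the base URL
	}

	data := `body { background: url("../img/bg.png") }
.logo { background: url("data:image/png;base64,AAAA") }`
	css.Process(log.NewTestLogger(t), base, data, processor)

	// The data: URI is skipped; only the file reference is reported.
	require.Equal(t, []string{"../img/bg.png"}, raw)
	require.Equal(t, []string{"https://example.com/img/bg.png"}, resolved)
}
```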
func Process(logger *log.Logger, url *url.URL, data string, processor urlProcessor) { css := scanner.New(data) for { token := css.Next() if token.Type == scanner.TokenEOF || token.Type == scanner.TokenError { break } if token.Type != scanner.TokenURI { continue } match := cssURLRe.FindStringSubmatch(token.Value) if match == nil { continue } src := match[1] if strings.HasPrefix(strings.ToLower(src), "data:") { continue // skip embedded data } u, err := url.Parse(src) if err != nil { logger.Error("Parsing URL failed", log.String("url", src), log.Err(err)) continue } processor(token, src, u) } } ================================================ FILE: go.mod ================================================ module github.com/cornelk/goscrape go 1.24.0 require ( github.com/alexflint/go-arg v1.6.0 github.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b github.com/gorilla/css v1.0.1 github.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb github.com/stretchr/testify v1.11.1 golang.org/x/net v0.46.0 ) require ( github.com/alexflint/go-scalar v1.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/kr/pretty v0.3.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) ================================================ FILE: go.sum ================================================ github.com/alexflint/go-arg v1.6.0 h1:wPP9TwTPO54fUVQl4nZoxbFfKCcy5E6HBCumj1XVRSo= github.com/alexflint/go-arg v1.6.0/go.mod h1:A7vTJzvjoaSTypg4biM5uYNTkJ27SkNTArtYXnlqVO8= github.com/alexflint/go-scalar v1.2.0 h1:WR7JPKkeNpnYIOfHRa7ivM21aWAdHD0gEWHCx+WQBRw= github.com/alexflint/go-scalar v1.2.0/go.mod h1:LoFvNMqS1CPrMVltza4LvnGKhaSpc3oyLEBUZVhhS2o= github.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b h1:jT7eWHlrlvjVXSV47U4IjdgI75nLhf/327EHYaGLzso= github.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b/go.mod h1:05rfUdBvcZdkWv0/oZfXevCTUzos6WafYzkPzkSkKbY= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8= github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0= github.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb h1:GlQyMv2C48qmfPItvAXFoyN341Swxp9JNVeUZxnmbJw= github.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.6.1 
h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k= github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= ================================================ FILE: htmlindex/attributes.go ================================================ package htmlindex import ( "net/url" "github.com/cornelk/gotokit/log" "github.com/cornelk/gotokit/set" "golang.org/x/net/html" ) type nodeAttributeParserData struct { logger *log.Logger url *url.URL node *html.Node attribute string value string } // nodeAttributeParser returns the URL values of the attribute of the node and // whether the attribute has been processed. type nodeAttributeParser func(data nodeAttributeParserData) ([]string, bool) // Node describes an HTML tag and its attributes that can contain URLs. type Node struct { Attributes []string noChildParsing bool parser nodeAttributeParser } // nolint: revive const ( BackgroundAttribute = "background" HrefAttribute = "href" DataSrcAttribute = "data-src" SrcAttribute = "src" DataSrcSetAttribute = "data-srcset" SrcSetAttribute = "srcset" ) // nolint: revive const ( ATag = "a" BodyTag = "body" ImgTag = "img" LinkTag = "link" ScriptTag = "script" StyleTag = "style" ) // Nodes describes the HTML tags and their attributes that can contain URL. var Nodes = map[string]Node{ ATag: { Attributes: []string{HrefAttribute}, }, BodyTag: { Attributes: []string{BackgroundAttribute}, }, ImgTag: { Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute}, parser: srcSetValueSplitter, }, LinkTag: { Attributes: []string{HrefAttribute}, }, ScriptTag: { Attributes: []string{SrcAttribute}, }, StyleTag: { noChildParsing: true, parser: styleParser, }, } // SrcSetAttributes contains the attributes that contain srcset values. var SrcSetAttributes = set.NewFromSlice([]string{ DataSrcSetAttribute, SrcSetAttribute, }) ================================================ FILE: htmlindex/htmlindex.go ================================================ // Package htmlindex provides an index for all HTML tags of relevance for scraping. package htmlindex import ( "fmt" "net/url" "slices" "sort" "strings" "github.com/cornelk/goscrape/css" "github.com/cornelk/gotokit/log" "golang.org/x/net/html" ) // Index provides an index for all HTML tags of relevance for scraping. 
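A compact sketch of how this index is typically driven (parse an HTML document, index it against a base URL, then query references by tag); the sample document and test name are illustrative only:

```go
package htmlindex_test

import (
	"net/url"
	"strings"
	"testing"

	"github.com/cornelk/goscrape/htmlindex"
	"github.com/cornelk/gotokit/log"
	"github.com/stretchr/testify/require"
	"golang.org/x/net/html"
)

func TestIndexSketch(t *testing.T) {
	doc, err := html.Parse(strings.NewReader(
		`<html><body><a href="/doc/guide.pdf">Guide</a></body></html>`))
	require.NoError(t, err)

	base, err := url.Parse("https://domain.com/")
	require.NoError(t, err)

	idx := htmlindex.New(log.NewTestLogger(t))
	idx.Index(base, doc)

	// URLs returns the references of a tag resolved against the base URL.
	refs, err := idx.URLs(htmlindex.ATag)
	require.NoError(t, err)
	require.Len(t, refs, 1)
	require.Equal(t, "https://domain.com/doc/guide.pdf", refs[0].String())

	// Nodes maps each resolved URL back to the HTML nodes referencing it.
	nodes := idx.Nodes(htmlindex.ATag)
	require.Len(t, nodes["https://domain.com/doc/guide.pdf"], 1)
}
```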
type Index struct { logger *log.Logger // key is HTML tag, value is a map of all its urls and the HTML nodes for it data map[string]map[string][]*html.Node } // New returns a new index. func New(logger *log.Logger) *Index { return &Index{ logger: logger, data: make(map[string]map[string][]*html.Node), } } // Index the given HTML document. func (idx *Index) Index(baseURL *url.URL, node *html.Node) { for child := node.FirstChild; child != nil; child = child.NextSibling { switch child.Type { case html.ElementNode: idx.indexElementNode(baseURL, node, child) default: } } } func (idx *Index) indexElementNode(baseURL *url.URL, node, child *html.Node) { var references []string info, ok := Nodes[child.Data] if ok { references = idx.nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...) } m, ok := idx.data[child.Data] if !ok { m = map[string][]*html.Node{} idx.data[child.Data] = m } for _, reference := range references { m[reference] = append(m[reference], child) } if node.FirstChild != nil && !info.noChildParsing { idx.Index(baseURL, child) } } // URLs returns all URLs of the references found for a specific tag. func (idx *Index) URLs(tag string) ([]*url.URL, error) { m, ok := idx.data[tag] if !ok { return nil, nil } data := make([]string, 0, len(m)) for key := range m { data = append(data, key) } sort.Strings(data) urls := make([]*url.URL, 0, len(m)) for _, fullURL := range data { u, err := url.Parse(fullURL) if err != nil { return nil, fmt.Errorf("parsing URL '%s': %w", fullURL, err) } urls = append(urls, u) } return urls, nil } // Nodes returns a map of all URLs and their HTML nodes. func (idx *Index) Nodes(tag string) map[string][]*html.Node { m, ok := idx.data[tag] if ok { return m } return map[string][]*html.Node{} } // nodeAttributeURLs returns resolved URLs based on the base URL and the HTML node attribute values. func (idx *Index) nodeAttributeURLs(baseURL *url.URL, node *html.Node, parser nodeAttributeParser, attributeNames ...string) []string { var results []string processReferences := func(references []string) { for _, reference := range references { ur, err := url.Parse(reference) if err != nil { continue } ur = baseURL.ResolveReference(ur) results = append(results, ur.String()) } } for _, attr := range node.Attr { if !slices.Contains(attributeNames, attr.Key) { continue } var references []string var parserHandled bool if parser != nil { data := nodeAttributeParserData{ logger: idx.logger, url: baseURL, node: node, attribute: attr.Key, value: strings.TrimSpace(attr.Val), } references, parserHandled = parser(data) } if parser == nil || !parserHandled { references = append(references, strings.TrimSpace(attr.Val)) } processReferences(references) } // special case to support style tag if len(attributeNames) == 0 && parser != nil { data := nodeAttributeParserData{ logger: idx.logger, url: baseURL, node: node, } references, _ := parser(data) processReferences(references) } return results } // srcSetValueSplitter returns the URL values of the srcset attribute of img nodes. func srcSetValueSplitter(data nodeAttributeParserData) ([]string, bool) { if !SrcSetAttributes.Contains(data.attribute) { return nil, false } // split the set of responsive images values := strings.Split(data.value, ",") for i, value := range values { value = strings.TrimSpace(value) // remove the width in pixels after the url values[i], _, _ = strings.Cut(value, " ") } return values, true } // styleParser returns the URL values of a CSS style tag. 
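Before the style tag parser below, the srcset splitting above can be exercised in isolation from a package-level test; a minimal sketch with made-up URLs and test name:

```go
package htmlindex

import "testing"

// TestSrcSetSplitterSketch is an illustrative check of srcSetValueSplitter:
// each srcset candidate is reduced to its URL, dropping the width descriptor.
func TestSrcSetSplitterSketch(t *testing.T) {
	data := nodeAttributeParserData{
		attribute: SrcSetAttribute,
		value:     "/img/small.jpg 480w, /img/large.jpg 800w",
	}

	values, handled := srcSetValueSplitter(data)
	if !handled {
		t.Fatal("expected the srcset attribute to be handled")
	}

	want := []string{"/img/small.jpg", "/img/large.jpg"}
	if len(values) != len(want) {
		t.Fatalf("got %d values, want %d", len(values), len(want))
	}
	for i := range want {
		if values[i] != want[i] {
			t.Fatalf("value %d: got %q, want %q", i, values[i], want[i])
		}
	}
}
```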
func styleParser(data nodeAttributeParserData) ([]string, bool) { if data.node.FirstChild == nil { return nil, false } var urls []string processor := func(_ *css.Token, _ string, url *url.URL) { urls = append(urls, url.String()) } cssData := data.node.FirstChild.Data css.Process(data.logger, data.url, cssData, processor) return urls, true } ================================================ FILE: htmlindex/htmlindex_test.go ================================================ package htmlindex import ( "bytes" "net/url" "testing" "github.com/cornelk/gotokit/log" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/net/html" ) func TestIndex(t *testing.T) { input := []byte(` Guide `) idx := testSetup(t, input) // check a tag nodeTag := "a" references, err := idx.URLs(nodeTag) require.NoError(t, err) require.Len(t, references, 1) tagURL := "https://domain.com/wp-content/uploads/document.pdf" assert.Equal(t, tagURL, references[0].String()) urls := idx.Nodes(nodeTag) require.Len(t, urls, 1) nodes, ok := urls[tagURL] require.True(t, ok) require.Len(t, nodes, 1) node := nodes[0] assert.Equal(t, nodeTag, node.Data) // check img tag nodeTag = "img" references, err = idx.URLs(nodeTag) require.NoError(t, err) require.Len(t, references, 1) tagURL = "https://domain.com/test.jpg" assert.Equal(t, tagURL, references[0].String()) // check for not existing tag nodeTag = "not-existing" references, err = idx.URLs(nodeTag) require.NoError(t, err) require.Empty(t, references) urls = idx.Nodes(nodeTag) require.Empty(t, urls) } func TestIndexImg(t *testing.T) { input := []byte(` `) idx := testSetup(t, input) references, err := idx.URLs(ImgTag) require.NoError(t, err) require.Len(t, references, 3) assert.Equal(t, "https://domain.com/test-480w.jpg", references[0].String()) assert.Equal(t, "https://domain.com/test-800w.jpg", references[1].String()) assert.Equal(t, "https://domain.com/test.jpg", references[2].String()) references, err = idx.URLs(BodyTag) require.NoError(t, err) require.Len(t, references, 1) assert.Equal(t, "https://domain.com/bg.jpg", references[0].String()) } func testSetup(t *testing.T, input []byte) *Index { t.Helper() buf := &bytes.Buffer{} _, err := buf.Write(input) require.NoError(t, err) doc, err := html.Parse(buf) require.NoError(t, err) ur, err := url.Parse("https://domain.com/") require.NoError(t, err) logger := log.NewTestLogger(t) idx := New(logger) idx.Index(ur, doc) return idx } ================================================ FILE: main.go ================================================ // Package main provides a command line tool to scrape websites and create an offline browsable version on the disk. 
package main import ( "context" "encoding/json" "errors" "fmt" "os" "strings" "github.com/alexflint/go-arg" "github.com/cornelk/goscrape/scraper" "github.com/cornelk/gotokit/app" "github.com/cornelk/gotokit/buildinfo" "github.com/cornelk/gotokit/env" "github.com/cornelk/gotokit/log" ) var ( version = "dev" commit = "" date = "" ) type arguments struct { Include []string `arg:"-n,--include" help:"only include URLs with PERL Regular Expressions support"` Exclude []string `arg:"-x,--exclude" help:"exclude URLs with PERL Regular Expressions support"` Output string `arg:"-o,--output" help:"output directory to write files to"` URLs []string `arg:"positional"` Depth int64 `arg:"-d,--depth" help:"download depth, 0 for unlimited" default:"10"` ImageQuality int64 `arg:"-i,--imagequality" help:"image quality, 0 to disable reencoding"` Timeout int64 `arg:"-t,--timeout" help:"time limit in seconds for each HTTP request to connect and read the request body"` Serve string `arg:"-s,--serve" help:"serve the website using a webserver"` ServerPort int16 `arg:"-r,--serverport" help:"port to use for the webserver" default:"8080"` CookieFile string `arg:"-c,--cookiefile" help:"file containing the cookie content"` SaveCookieFile string `arg:"--savecookiefile" help:"file to save the cookie content"` Headers []string `arg:"-h,--header" help:"HTTP header to use for scraping"` Proxy string `arg:"-p,--proxy" help:"proxy to use in format scheme://[user:password@]host:port (supports HTTP, HTTPS, SOCKS5 protocols)"` User string `arg:"-u,--user" help:"user[:password] to use for HTTP authentication"` UserAgent string `arg:"-a,--useragent" help:"user agent to use for scraping"` Verbose bool `arg:"-v,--verbose" help:"verbose output"` } func (arguments) Description() string { return "Scrape a website and create an offline browsable version on the disk.\n" } func (arguments) Version() string { return fmt.Sprintf("Version: %s\n", buildinfo.Version(version, commit, date)) } func main() { args, err := readArguments() if err != nil { fmt.Printf("Reading arguments failed: %s\n", err) os.Exit(1) } ctx := app.Context() if args.Verbose { log.SetDefaultLevel(log.DebugLevel) } logger, err := createLogger() if err != nil { fmt.Printf("Creating logger failed: %s\n", err) os.Exit(1) } if args.Serve != "" { if err := runServer(ctx, args, logger); err != nil { fmt.Printf("Server execution error: %s\n", err) os.Exit(1) } return } if err := runScraper(ctx, args, logger); err != nil { fmt.Printf("Scraping execution error: %s\n", err) os.Exit(1) } } func readArguments() (arguments, error) { var args arguments parser, err := arg.NewParser(arg.Config{}, &args) if err != nil { return arguments{}, fmt.Errorf("creating argument parser: %w", err) } if err = parser.Parse(os.Args[1:]); err != nil { switch { case errors.Is(err, arg.ErrHelp): parser.WriteHelp(os.Stdout) os.Exit(0) case errors.Is(err, arg.ErrVersion): fmt.Println(args.Version()) os.Exit(0) } return arguments{}, fmt.Errorf("parsing arguments: %w", err) } if len(args.URLs) == 0 && args.Serve == "" { parser.WriteHelp(os.Stdout) os.Exit(0) } return args, nil } func runScraper(ctx context.Context, args arguments, logger *log.Logger) error { if len(args.URLs) == 0 { return nil } var username, password string if args.User != "" { sl := strings.Split(args.User, ":") username = sl[0] if len(sl) > 1 { password = sl[1] } } imageQuality := args.ImageQuality if args.ImageQuality < 0 || args.ImageQuality >= 100 { imageQuality = 0 } cookies, err := readCookieFile(args.CookieFile) if err != nil { return 
fmt.Errorf("reading cookie: %w", err) } cfg := scraper.Config{ Includes: args.Include, Excludes: args.Exclude, ImageQuality: uint(imageQuality), MaxDepth: uint(args.Depth), Timeout: uint(args.Timeout), OutputDirectory: args.Output, Username: username, Password: password, Cookies: cookies, Header: scraper.Headers(args.Headers), Proxy: args.Proxy, UserAgent: args.UserAgent, } return scrapeURLs(ctx, cfg, logger, args) } func scrapeURLs(ctx context.Context, cfg scraper.Config, logger *log.Logger, args arguments) error { for _, url := range args.URLs { cfg.URL = url sc, err := scraper.New(logger, cfg) if err != nil { return fmt.Errorf("initializing scraper: %w", err) } logger.Info("Scraping", log.String("url", sc.URL.String())) if err = sc.Start(ctx); err != nil { if errors.Is(err, context.Canceled) { os.Exit(0) } return fmt.Errorf("scraping '%s': %w", sc.URL, err) } if args.SaveCookieFile != "" { if err := saveCookies(args.SaveCookieFile, sc.Cookies()); err != nil { return fmt.Errorf("saving cookies: %w", err) } } } return nil } func runServer(ctx context.Context, args arguments, logger *log.Logger) error { if err := scraper.ServeDirectory(ctx, args.Serve, args.ServerPort, logger); err != nil { return fmt.Errorf("serving directory: %w", err) } return nil } func createLogger() (*log.Logger, error) { logCfg, err := log.ConfigForEnv(env.Development) if err != nil { return nil, fmt.Errorf("initializing log config: %w", err) } logCfg.JSONOutput = false logCfg.CallerInfo = false logger, err := log.NewWithConfig(logCfg) if err != nil { return nil, fmt.Errorf("initializing logger: %w", err) } return logger, nil } func readCookieFile(cookieFile string) ([]scraper.Cookie, error) { if cookieFile == "" { return nil, nil } b, err := os.ReadFile(cookieFile) if err != nil { return nil, fmt.Errorf("reading cookie file: %w", err) } var cookies []scraper.Cookie if err := json.Unmarshal(b, &cookies); err != nil { return nil, fmt.Errorf("unmarshaling cookies: %w", err) } return cookies, nil } func saveCookies(cookieFile string, cookies []scraper.Cookie) error { if cookieFile == "" || len(cookies) == 0 { return nil } b, err := json.Marshal(cookies) if err != nil { return fmt.Errorf("marshaling cookies: %w", err) } if err := os.WriteFile(cookieFile, b, 0644); err != nil { return fmt.Errorf("saving cookies: %w", err) } return nil } ================================================ FILE: scraper/checks.go ================================================ // Package scraper provides a web scraper that can download a website and its assets. package scraper import ( "net/url" "strings" "github.com/cornelk/gotokit/log" ) // normalizeURLPath removes trailing slashes from URL paths for duplicate detection. // This treats URLs with and without trailing slashes as the same resource. func normalizeURLPath(path string) string { if path == "" { return "/" } // Keep root path as is, but remove trailing slashes from other paths if path != "/" && strings.HasSuffix(path, "/") { return strings.TrimSuffix(path, "/") } return path } // shouldURLBeDownloaded checks whether a page should be downloaded. 
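Before the download check below, a brief aside on the CLI wiring shown above in main.go: stripped of argument parsing and cookie handling, driving the scraper package directly looks roughly like the following sketch. The target URL, depth and output directory are placeholder values, and the logger setup simply mirrors createLogger:

```go
package main

import (
	"context"
	"fmt"
	"os"

	"github.com/cornelk/goscrape/scraper"
	"github.com/cornelk/gotokit/env"
	"github.com/cornelk/gotokit/log"
)

func main() {
	if err := run(context.Background()); err != nil {
		fmt.Println("error:", err)
		os.Exit(1)
	}
}

func run(ctx context.Context) error {
	logCfg, err := log.ConfigForEnv(env.Development)
	if err != nil {
		return fmt.Errorf("initializing log config: %w", err)
	}
	logger, err := log.NewWithConfig(logCfg)
	if err != nil {
		return fmt.Errorf("initializing logger: %w", err)
	}

	cfg := scraper.Config{
		URL:             "https://example.com", // placeholder target
		MaxDepth:        2,                     // placeholder depth
		OutputDirectory: "example-copy",        // placeholder output dir
	}
	sc, err := scraper.New(logger, cfg)
	if err != nil {
		return fmt.Errorf("initializing scraper: %w", err)
	}
	return sc.Start(ctx)
}
```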
// nolint: cyclop func (s *Scraper) shouldURLBeDownloaded(url *url.URL, currentDepth uint, isAsset bool) bool { if url.Scheme != "http" && url.Scheme != "https" { return false } p := url.String() if url.Host == s.URL.Host { p = url.Path } if p == "" { p = "/" } // Normalize the path for duplicate detection to handle trailing slashes normalizedPath := normalizeURLPath(p) if s.processed.Contains(normalizedPath) { // was already downloaded or checked? if url.Fragment != "" { return false } return false } s.processed.Add(normalizedPath) if !isAsset { if url.Host != s.URL.Host { s.logger.Debug("Skipping external host page", log.String("url", url.String())) return false } if s.config.MaxDepth != 0 && currentDepth == s.config.MaxDepth { s.logger.Debug("Skipping too deep level page", log.String("url", url.String())) return false } } if s.includes != nil && !s.isURLIncluded(url) { return false } if s.excludes != nil && s.isURLExcluded(url) { return false } s.logger.Debug("New URL to download", log.String("url", url.String())) return true } func (s *Scraper) isURLIncluded(url *url.URL) bool { for _, re := range s.includes { if re.MatchString(url.Path) { s.logger.Info("Including URL", log.String("url", url.String()), log.Stringer("included_expression", re)) return true } } return false } func (s *Scraper) isURLExcluded(url *url.URL) bool { for _, re := range s.excludes { if re.MatchString(url.Path) { s.logger.Info("Skipping URL", log.String("url", url.String()), log.Stringer("excluded_expression", re)) return true } } return false } ================================================ FILE: scraper/checks_test.go ================================================ package scraper import ( "net/url" "testing" "github.com/cornelk/gotokit/log" "github.com/cornelk/gotokit/set" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func TestNormalizeURLPath(t *testing.T) { tests := []struct { input string expected string }{ {"", "/"}, {"/", "/"}, {"/path", "/path"}, {"/path/", "/path"}, {"/path/to/resource", "/path/to/resource"}, {"/path/to/resource/", "/path/to/resource"}, {"/category/blog-post", "/category/blog-post"}, {"/category/blog-post/", "/category/blog-post"}, } for _, test := range tests { t.Run(test.input, func(t *testing.T) { result := normalizeURLPath(test.input) assert.Equal(t, test.expected, result) }) } } func TestShouldURLBeDownloaded_TrailingSlashDuplicates(t *testing.T) { logger := log.NewTestLogger(t) cfg := Config{ URL: "https://example.com", } scraper, err := New(logger, cfg) require.NoError(t, err) require.NotNil(t, scraper) // Initialize empty processed set scraper.processed = set.New[string]() // Test that URLs with and without trailing slashes are treated as duplicates url1, err := url.Parse("https://example.com/category/blog-post") require.NoError(t, err) url2, err := url.Parse("https://example.com/category/blog-post/") require.NoError(t, err) // First URL should be downloadable should1 := scraper.shouldURLBeDownloaded(url1, 0, false) assert.True(t, should1, "First URL should be downloadable") // Second URL with trailing slash should be treated as duplicate should2 := scraper.shouldURLBeDownloaded(url2, 0, false) assert.False(t, should2, "Second URL with trailing slash should be treated as duplicate") // Verify that the normalized path is in the processed set assert.True(t, scraper.processed.Contains("/category/blog-post")) } func TestShouldURLBeDownloaded_TrailingSlashDuplicatesReverse(t *testing.T) { logger := log.NewTestLogger(t) cfg := Config{ URL: 
"https://example.com", } scraper, err := New(logger, cfg) require.NoError(t, err) require.NotNil(t, scraper) // Initialize empty processed set scraper.processed = set.New[string]() // Test reverse order - trailing slash first, then without url1, err := url.Parse("https://example.com/category/blog-post/") require.NoError(t, err) url2, err := url.Parse("https://example.com/category/blog-post") require.NoError(t, err) // First URL with trailing slash should be downloadable should1 := scraper.shouldURLBeDownloaded(url1, 0, false) assert.True(t, should1, "First URL with trailing slash should be downloadable") // Second URL without trailing slash should be treated as duplicate should2 := scraper.shouldURLBeDownloaded(url2, 0, false) assert.False(t, should2, "Second URL without trailing slash should be treated as duplicate") // Verify that the normalized path is in the processed set assert.True(t, scraper.processed.Contains("/category/blog-post")) } func TestShouldURLBeDownloaded_RootPath(t *testing.T) { logger := log.NewTestLogger(t) cfg := Config{ URL: "https://example.com", } scraper, err := New(logger, cfg) require.NoError(t, err) require.NotNil(t, scraper) // Initialize empty processed set scraper.processed = set.New[string]() // Test root path normalization url1, err := url.Parse("https://example.com/") require.NoError(t, err) url2, err := url.Parse("https://example.com") require.NoError(t, err) // First root URL should be downloadable should1 := scraper.shouldURLBeDownloaded(url1, 0, false) assert.True(t, should1, "First root URL should be downloadable") // Second root URL should be treated as duplicate should2 := scraper.shouldURLBeDownloaded(url2, 0, false) assert.False(t, should2, "Second root URL should be treated as duplicate") // Verify that the normalized root path is in the processed set assert.True(t, scraper.processed.Contains("/")) } func TestShouldURLBeDownloaded_ExternalURLs(t *testing.T) { logger := log.NewTestLogger(t) cfg := Config{ URL: "https://example.com", } scraper, err := New(logger, cfg) require.NoError(t, err) require.NotNil(t, scraper) // Initialize empty processed set scraper.processed = set.New[string]() // Test external URLs with trailing slashes as assets url1, err := url.Parse("https://external.com/path.css") require.NoError(t, err) url2, err := url.Parse("https://external.com/path.css/") require.NoError(t, err) // First external asset should be downloadable (if it passes other checks) should1 := scraper.shouldURLBeDownloaded(url1, 0, true) // asset = true // Second external asset with trailing slash should be treated as duplicate should2 := scraper.shouldURLBeDownloaded(url2, 0, true) // asset = true // First should pass, second should be blocked as duplicate assert.True(t, should1, "First external asset should be downloadable") assert.False(t, should2, "Second external asset with trailing slash should be treated as duplicate") // Verify that the normalized external URL is in the processed set normalizedURL1 := normalizeURLPath(url1.String()) assert.True(t, scraper.processed.Contains(normalizedURL1)) } ================================================ FILE: scraper/cookies.go ================================================ package scraper import ( "fmt" "net/http" "net/http/cookiejar" "net/url" "time" ) // Cookie represents a cookie, it copies parts of the http.Cookie struct but changes // the JSON marshaling to exclude empty fields. 
type Cookie struct { Name string `json:"name"` Value string `json:"value,omitempty"` Expires *time.Time `json:"expires,omitempty"` } // Cookies returns the current cookies. func (s *Scraper) Cookies() []Cookie { httpCookies := s.cookies.Cookies(s.URL) cookies := make([]Cookie, 0, len(httpCookies)) for _, c := range httpCookies { cookie := Cookie{ Name: c.Name, Value: c.Value, } if !c.Expires.IsZero() { cookie.Expires = &c.Expires } cookies = append(cookies, cookie) } return cookies } func createCookieJar(u *url.URL, cookies []Cookie) (*cookiejar.Jar, error) { jar, err := cookiejar.New(nil) if err != nil { return nil, fmt.Errorf("creating cookie jar: %w", err) } httpCookies := make([]*http.Cookie, 0, len(cookies)) for _, c := range cookies { h := &http.Cookie{ Name: c.Name, Value: c.Value, } if c.Expires != nil { h.Expires = *c.Expires } httpCookies = append(httpCookies, h) } jar.SetCookies(u, httpCookies) return jar, nil } ================================================ FILE: scraper/download.go ================================================ package scraper import ( "context" "errors" "fmt" "net/url" "path" "github.com/cornelk/goscrape/css" "github.com/cornelk/goscrape/htmlindex" "github.com/cornelk/gotokit/log" ) // assetProcessor is a processor of a downloaded asset that can transform // a downloaded file content before it will be stored on disk. type assetProcessor func(URL *url.URL, data []byte) []byte var tagsWithReferences = []string{ htmlindex.LinkTag, htmlindex.ScriptTag, htmlindex.BodyTag, htmlindex.StyleTag, } func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error { references, err := index.URLs(htmlindex.BodyTag) if err != nil { s.logger.Error("Getting body node URLs failed", log.Err(err)) } s.imagesQueue = append(s.imagesQueue, references...) references, err = index.URLs(htmlindex.ImgTag) if err != nil { s.logger.Error("Getting img node URLs failed", log.Err(err)) } s.imagesQueue = append(s.imagesQueue, references...) for _, tag := range tagsWithReferences { references, err = index.URLs(tag) if err != nil { s.logger.Error("Getting node URLs failed", log.String("node", tag), log.Err(err)) } var processor assetProcessor if tag == htmlindex.LinkTag { processor = s.cssProcessor } for _, ur := range references { if err := s.downloadAsset(ctx, ur, processor); err != nil && errors.Is(err, context.Canceled) { return err } } } for _, image := range s.imagesQueue { if err := s.downloadAsset(ctx, image, s.checkImageForRecode); err != nil && errors.Is(err, context.Canceled) { return err } } s.imagesQueue = nil return nil } // downloadAsset downloads an asset if it does not exist on disk yet. 
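Before downloadAsset below, a brief aside on the cookie handling in cookies.go above: the Cookie struct doubles as the on-disk format read by --cookiefile and written by --savecookiefile. A minimal sketch of the resulting JSON; the cookie names and values are made up:

```go
package scraper_test

import (
	"encoding/json"
	"fmt"

	"github.com/cornelk/goscrape/scraper"
)

// Example_cookieFileFormat prints the JSON layout expected by --cookiefile;
// empty values and missing expiry times are omitted by the struct tags.
func Example_cookieFileFormat() {
	cookies := []scraper.Cookie{
		{Name: "user", Value: "123"},
		{Name: "session", Value: "sid"},
	}

	b, err := json.Marshal(cookies)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b))
	// Output: [{"name":"user","value":"123"},{"name":"session","value":"sid"}]
}
```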
func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor assetProcessor) error { u.Fragment = "" urlFull := u.String() if !s.shouldURLBeDownloaded(u, 0, true) { return nil } filePath := s.getFilePath(u, false) if s.fileExists(filePath) { return nil } s.logger.Info("Downloading asset", log.String("url", urlFull)) data, _, err := s.httpDownloader(ctx, u) if err != nil { s.logger.Error("Downloading asset failed", log.String("url", urlFull), log.Err(err)) return fmt.Errorf("downloading asset: %w", err) } if processor != nil { data = processor(u, data) } if err = s.fileWriter(filePath, data); err != nil { s.logger.Error("Writing asset file failed", log.String("url", urlFull), log.String("file", filePath), log.Err(err)) } return nil } func (s *Scraper) cssProcessor(baseURL *url.URL, data []byte) []byte { urls := make(map[string]string) processor := func(token *css.Token, data string, u *url.URL) { s.imagesQueue = append(s.imagesQueue, u) cssPath := *u cssPath.Path = path.Dir(cssPath.Path) + "/" resolved := resolveURL(&cssPath, data, s.URL.Host, false, "") urls[token.Value] = resolved } cssData := string(data) css.Process(s.logger, baseURL, cssData, processor) if len(urls) == 0 { return data } for ori, filePath := range urls { cssData = replaceCSSUrls(ori, filePath, cssData) s.logger.Debug("CSS Element relinked", log.String("url", ori), log.String("fixed_url", filePath)) } return []byte(cssData) } ================================================ FILE: scraper/download_test.go ================================================ package scraper import ( "net/url" "testing" "github.com/cornelk/gotokit/log" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func TestCSSProcessor(t *testing.T) { logger := log.NewTestLogger(t) cfg := Config{ URL: "http://localhost", } s, err := New(logger, cfg) require.NoError(t, err) var fixtures = map[string]string{ "url('http://localhost/uri/between/single/quote')": "http://localhost/uri/between/single/quote", `url("http://localhost/uri/between/double/quote")`: "http://localhost/uri/between/double/quote", "url(http://localhost/uri)": "http://localhost/uri", "url(data:image/gif;base64,R0lGODl)": "", `div#gopher { background: url(/doc/gopher/frontpage.png) no-repeat; height: 155px; }`: "http://localhost/doc/gopher/frontpage.png", } u, _ := url.Parse("http://localhost") for input, expected := range fixtures { s.imagesQueue = nil s.cssProcessor(u, []byte(input)) if expected == "" { assert.Empty(t, s.imagesQueue) continue } assert.NotEmpty(t, s.imagesQueue) res := s.imagesQueue[0].String() assert.Equal(t, expected, res) } } ================================================ FILE: scraper/fileutil.go ================================================ package scraper import ( "fmt" "hash/fnv" "net/url" "path/filepath" "strings" ) const ( // PageExtension is the file extension that downloaded pages get. PageExtension = ".html" // PageDirIndex is the file name of the index file for every dir. PageDirIndex = "index" + PageExtension // MaxFilenameLength is the maximum length for a filename component to ensure filesystem compatibility. MaxFilenameLength = 200 ) // getFilePath returns a file path for a URL to store the URL content in. // The isAPage parameter is crucial: it tells us whether this URL contains HTML content // that should be treated as a web page (with .html extensions and directory indexing) // or if it's a binary file that should keep its original path unchanged. 
// Without this distinction, binary files would get corrupted paths like image.jpg.html. func (s *Scraper) getFilePath(url *url.URL, isAPage bool) string { fileName := url.Path if isAPage { // This is HTML content - apply web page naming conventions fileName = getPageFilePath(url) } // If not a page, keep the original URL path for binary files var externalHost string if url.Host != s.URL.Host { externalHost = "_" + url.Host // _ is a prefix for external domains on the filesystem } // Split the file path into directory and filename components dir := filepath.Dir(fileName) base := filepath.Base(fileName) // Truncate the filename component if it's too long truncatedBase := truncateFilename(base) // Reconstruct the path with the truncated filename if dir == "." { fileName = truncatedBase } else { fileName = filepath.Join(dir, truncatedBase) } return filepath.Join(s.config.OutputDirectory, s.URL.Host, externalHost, fileName) } // getPageFilePath returns a filename for a URL that represents a web page. // This function adds .html extensions and handles directory indexing, // which is what we want for HTML content but NOT for binary files like images or PDFs. func getPageFilePath(url *url.URL) string { fileName := url.Path // root of domain will be index.html switch { case fileName == "" || fileName == "/": fileName = PageDirIndex // directory index will be index.html in the directory case fileName[len(fileName)-1] == '/': fileName += PageDirIndex default: ext := filepath.Ext(fileName) // if file extension is missing add .html, otherwise keep the existing file extension if ext == "" { fileName += PageExtension } } return fileName } // truncateFilename truncates a filename if it exceeds MaxFilenameLength while preserving the extension. func truncateFilename(filename string) string { if len(filename) <= MaxFilenameLength { return filename } ext := filepath.Ext(filename) baseName := strings.TrimSuffix(filename, ext) // Calculate how much space we need for hash and extension hashLength := 8 // Using first 8 hex characters (from 32-bit FNV) reservedLength := hashLength + len(ext) // If the extension alone is too long, truncate it too if reservedLength > MaxFilenameLength { ext = ext[:MaxFilenameLength-hashLength] reservedLength = hashLength + len(ext) } maxBaseLength := MaxFilenameLength - reservedLength if maxBaseLength <= 0 { maxBaseLength = 1 } truncatedBase := baseName[:maxBaseLength] // Generate FNV-1a hash of original filename to ensure uniqueness h := fnv.New32a() _, _ = h.Write([]byte(filename)) hashStr := fmt.Sprintf("%08x", h.Sum32())[:hashLength] return truncatedBase + hashStr + ext } ================================================ FILE: scraper/fileutil_test.go ================================================ package scraper import ( "net/url" "os" "strings" "testing" "github.com/cornelk/gotokit/log" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func TestGetFilePath(t *testing.T) { type filePathFixture struct { BaseURL string DownloadURL string ExpectedFilePath string } pathSeparator := string(os.PathSeparator) expectedBasePath := "google.com" + pathSeparator var fixtures = []filePathFixture{ {"https://google.com/", "https://github.com/", expectedBasePath + "_github.com" + pathSeparator + "index.html"}, {"https://google.com/", "https://github.com/#fragment", expectedBasePath + "_github.com" + pathSeparator + "index.html"}, {"https://google.com/", "https://github.com/test", expectedBasePath + "_github.com" + pathSeparator + "test.html"}, 
{"https://google.com/", "https://github.com/test/", expectedBasePath + "_github.com" + pathSeparator + "test" + pathSeparator + "index.html"}, {"https://google.com/", "https://github.com/test.aspx", expectedBasePath + "_github.com" + pathSeparator + "test.aspx"}, {"https://google.com/", "https://google.com/settings", expectedBasePath + "settings.html"}, } var cfg Config logger := log.NewTestLogger(t) for _, fix := range fixtures { cfg.URL = fix.BaseURL s, err := New(logger, cfg) require.NoError(t, err) URL, err := url.Parse(fix.DownloadURL) require.NoError(t, err) output := s.getFilePath(URL, true) assert.Equal(t, fix.ExpectedFilePath, output) } } func TestTruncateFilename(t *testing.T) { tests := []struct { name string filename string expected func(string) bool // function to validate the result }{ { name: "short filename unchanged", filename: "test.css", expected: func(result string) bool { return result == "test.css" }, }, { name: "long filename gets truncated", filename: "very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename.css", expected: func(result string) bool { return len(result) <= MaxFilenameLength && len(result) > 0 && result != "very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename.css" && result[len(result)-4:] == ".css" }, }, { name: "filename without extension", filename: "very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename", expected: func(result string) bool { return len(result) <= MaxFilenameLength && len(result) > 0 && result != "very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-very-long-filename" }, }, { name: "empty filename", filename: "", expected: func(result string) bool { return result == "" }, }, { name: "filename at max length", filename: strings.Repeat("a", MaxFilenameLength), expected: func(result string) bool { return len(result) == MaxFilenameLength }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { result := truncateFilename(tt.filename) assert.True(t, tt.expected(result), "filename: %q, result: %q", tt.filename, result) }) } } func TestTruncateFilenameUniqueness(t *testing.T) { // Test that different long filenames with the same prefix produce different results longPrefix := "this-is-a-very-long-filename-prefix-that-will-be-truncated-and-should-produce-different-results-based-on-the-hash-suffix-when-the-full-filename-is-different" filename1 := longPrefix + "-file1.css" filename2 := longPrefix + "-file2.css" result1 := truncateFilename(filename1) result2 := truncateFilename(filename2) assert.NotEqual(t, result1, result2, "Different long filenames should produce different truncated results") assert.LessOrEqual(t, len(result1), MaxFilenameLength, "Result1 should be within max length") assert.LessOrEqual(t, len(result2), MaxFilenameLength, "Result2 should be within max length") } ================================================ FILE: scraper/fs.go ================================================ package scraper import ( 
"fmt" "os" "path/filepath" "github.com/cornelk/gotokit/log" ) // createDownloadPath creates the download path if it does not exist yet. func (s *Scraper) createDownloadPath(path string) error { if path == "" { return nil } s.logger.Debug("Creating dir", log.String("path", path)) if err := os.MkdirAll(path, os.ModePerm); err != nil { return fmt.Errorf("creating directory '%s': %w", path, err) } return nil } func (s *Scraper) writeFile(filePath string, data []byte) error { dir := filepath.Dir(filePath) if len(dir) < len(s.URL.Host) { // nothing to append if it is the root dir dir = filepath.Join(".", s.URL.Host, dir) } if err := s.dirCreator(dir); err != nil { return err } s.logger.Debug("Creating file", log.String("path", filePath)) f, err := os.Create(filePath) if err != nil { return fmt.Errorf("creating file '%s': %w", filePath, err) } if _, err = f.Write(data); err != nil { // nolint: wrapcheck _ = f.Close() // try to close and remove file but return the first error _ = os.Remove(filePath) return fmt.Errorf("writing to file: %w", err) } if err := f.Close(); err != nil { return fmt.Errorf("closing file: %w", err) } return nil } func (s *Scraper) fileExists(filePath string) bool { if _, err := os.Stat(filePath); !os.IsNotExist(err) { return true } return false } ================================================ FILE: scraper/html.go ================================================ package scraper import ( "bytes" "fmt" "net/url" "slices" "strings" "github.com/cornelk/goscrape/css" "github.com/cornelk/goscrape/htmlindex" "github.com/cornelk/gotokit/log" "golang.org/x/net/html" ) // ignoredURLPrefixes contains a list of URL prefixes that do not need to bo adjusted. var ignoredURLPrefixes = []string{ "#", // fragment "/#", // fragment "data:", // embedded data "mailto:", // mail address } // fixURLReferences fixes URL references to point to relative file names. // It returns a bool that indicates that no reference needed to be fixed, // in this case the returned HTML string will be empty. func (s *Scraper) fixURLReferences(url *url.URL, doc *html.Node, index *htmlindex.Index) ([]byte, bool, error) { relativeToRoot := urlRelativeToRoot(url) if !s.fixHTMLNodeURLs(url, relativeToRoot, index) { return nil, false, nil } var rendered bytes.Buffer if err := html.Render(&rendered, doc); err != nil { return nil, false, fmt.Errorf("rendering html: %w", err) } return rendered.Bytes(), true, nil } // fixHTMLNodeURLs processes all HTML nodes that contain URLs that need to be fixed // to link to downloaded files. It returns whether any URLS have been fixed. func (s *Scraper) fixHTMLNodeURLs(baseURL *url.URL, relativeToRoot string, index *htmlindex.Index) bool { var changed bool for tag, nodeInfo := range htmlindex.Nodes { isHyperlink := tag == htmlindex.ATag urls := index.Nodes(tag) for _, nodes := range urls { for _, node := range nodes { switch node.Data { case htmlindex.StyleTag: if s.fixScriptNodeURL(baseURL, node, isHyperlink, relativeToRoot) { changed = true } default: if s.fixNodeURL(baseURL, nodeInfo.Attributes, node, isHyperlink, relativeToRoot) { changed = true } } } } } return changed } // fixNodeURL fixes the URL references of a HTML node to point to a relative file name. // It returns whether any attribute value bas been adjusted. 
func (s *Scraper) fixNodeURL(baseURL *url.URL, attributes []string, node *html.Node, isHyperlink bool, relativeToRoot string) bool { var changed bool for i, attr := range node.Attr { if !slices.Contains(attributes, attr.Key) { continue } value := strings.TrimSpace(attr.Val) if value == "" { continue } for _, prefix := range ignoredURLPrefixes { if strings.HasPrefix(value, prefix) { return false } } var adjusted string if htmlindex.SrcSetAttributes.Contains(attr.Key) { adjusted = resolveSrcSetURLs(baseURL, value, s.URL.Host, isHyperlink, relativeToRoot) } else { adjusted = resolveURL(baseURL, value, s.URL.Host, isHyperlink, relativeToRoot) } if adjusted == value { // check for no change continue } s.logger.Debug("HTML node relinked", log.String("value", value), log.String("fixed_value", adjusted)) attribute := &node.Attr[i] attribute.Val = adjusted changed = true } return changed } // fixScriptNodeURL fixes the URL references of an HTML script node to point to a relative file name. // It returns whether any attribute value has been adjusted. func (s *Scraper) fixScriptNodeURL(baseURL *url.URL, node *html.Node, isHyperlink bool, relativeToRoot string) bool { if node.FirstChild == nil { return false } urls := map[string]string{} processor := func(_ *css.Token, before string, _ *url.URL) { adjusted := resolveURL(baseURL, before, s.URL.Host, isHyperlink, relativeToRoot) if before != adjusted { urls[before] = adjusted } } cssData := node.FirstChild.Data css.Process(s.logger, baseURL, cssData, processor) var changed bool for before, filePath := range urls { cssData = replaceCSSUrls(before, filePath, cssData) s.logger.Debug("CSS Element relinked", log.String("url", before), log.String("fixed_url", filePath)) changed = true } node.FirstChild.Data = cssData return changed } func resolveSrcSetURLs(base *url.URL, srcSetValue, mainPageHost string, isHyperlink bool, relativeToRoot string) string { // split the set of responsive images values := strings.Split(srcSetValue, ",") for i, value := range values { value = strings.TrimSpace(value) parts := strings.Split(value, " ") parts[0] = resolveURL(base, parts[0], mainPageHost, isHyperlink, relativeToRoot) values[i] = strings.Join(parts, " ") } return strings.Join(values, ", ") } func replaceCSSUrls(before, after, content string) string { prefixes := []string{ "\"", "'", "", } for _, prefix := range prefixes { wrong := fmt.Sprintf("url(%s%s%s)", prefix, before, prefix) fixed := "url('" + after + "')" content = strings.ReplaceAll(content, wrong, fixed) } return content } ================================================ FILE: scraper/html_test.go ================================================ package scraper import ( "bytes" "testing" "github.com/cornelk/goscrape/htmlindex" "github.com/cornelk/gotokit/log" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/net/html" ) func TestFixURLReferences(t *testing.T) { logger := log.NewTestLogger(t) cfg := Config{ URL: "http://domain.com", } s, err := New(logger, cfg) require.NoError(t, err) b := []byte(` Guide `) buf := &bytes.Buffer{} _, err = buf.Write(b) require.NoError(t, err) doc, err := html.Parse(buf) require.NoError(t, err) index := htmlindex.New(logger) index.Index(s.URL, doc) ref, fixed, err := s.fixURLReferences(s.URL, doc, index) require.NoError(t, err) assert.True(t, fixed) expected := "" + "Guide\n" + " \n\n" + "" assert.Equal(t, expected, string(ref)) } ================================================ FILE: scraper/http.go
================================================ package scraper import ( "bytes" "context" "errors" "fmt" "io" "net/http" "net/url" "strings" "time" "github.com/cornelk/gotokit/app" "github.com/cornelk/gotokit/log" ) var ( maxRetries = 10 retryDelay = 1500 * time.Millisecond errExhaustedRetries = errors.New("exhausted retries") ) func (s *Scraper) downloadURL(ctx context.Context, u *url.URL) (*http.Response, error) { req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil) if err != nil { return nil, fmt.Errorf("creating HTTP request: %w", err) } req.Header.Set("User-Agent", s.config.UserAgent) if s.auth != "" { req.Header.Set("Authorization", s.auth) } for key, values := range s.config.Header { for _, value := range values { req.Header.Set(key, value) } } resp, err := s.client.Do(req) if err != nil { return nil, fmt.Errorf("executing HTTP request: %w", err) } return resp, nil } func (s *Scraper) downloadURLWithRetries(ctx context.Context, u *url.URL) ([]byte, *url.URL, error) { var err error var resp *http.Response for retries := range maxRetries + 2 { if retries == maxRetries+1 { return nil, nil, fmt.Errorf("%w for URL %s", errExhaustedRetries, u) } resp, err = s.downloadURL(ctx, u) if err != nil { return nil, nil, err } if resp.StatusCode == http.StatusTooManyRequests { s.logger.Warn("Too Many Requests. Retrying again", log.Int("num", retries+1), log.Int("max", maxRetries), log.String("url", u.String())) // Wait a bit and try again, increasing the delay linearly with each retry if err := app.Sleep(ctx, (time.Duration(retries)+1)*retryDelay); err != nil { return nil, nil, fmt.Errorf("sleeping between retries: %w", err) } continue } break } defer func() { if err := resp.Body.Close(); err != nil { s.logger.Error("Closing HTTP response body failed", log.String("url", u.String()), log.Err(err)) } }() if resp.StatusCode != http.StatusOK { return nil, nil, fmt.Errorf("unexpected HTTP request status code %d", resp.StatusCode) } buf := &bytes.Buffer{} if _, err := io.Copy(buf, resp.Body); err != nil { return nil, nil, fmt.Errorf("reading HTTP response body: %w", err) } return buf.Bytes(), resp.Request.URL, nil } // Headers converts a slice of strings to a http.Header.
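// Each entry is split on the first ':' only, so Headers([]string{"a:b", "c:d:e"}) yields
// Get("a") == "b" and Get("c") == "d:e"; entries without a ':' are skipped and values are
// not trimmed of surrounding whitespace.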
func Headers(headers []string) http.Header { h := http.Header{} for _, header := range headers { sl := strings.SplitN(header, ":", 2) if len(sl) == 2 { h.Set(sl[0], sl[1]) } } return h } ================================================ FILE: scraper/http_test.go ================================================ package scraper import ( "context" "fmt" "net/http" "net/http/httptest" "net/url" "testing" "time" "github.com/cornelk/gotokit/log" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func TestHeaders(t *testing.T) { headers := Headers([]string{"a:b", "c:d:e"}) assert.Equal(t, "b", headers.Get("a")) assert.Equal(t, "d:e", headers.Get("c")) } func TestDownloadURLWithRetries(t *testing.T) { ctx := context.Background() expected := "ok" var retry int svr := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { if retry < maxRetries { retry++ w.WriteHeader(http.StatusTooManyRequests) return } _, err := fmt.Fprint(w, expected) assert.NoError(t, err) })) defer svr.Close() ur, err := url.Parse(svr.URL) require.NoError(t, err) maxRetries = 2 retryDelay = time.Millisecond var cfg Config logger := log.NewTestLogger(t) s, err := New(logger, cfg) require.NoError(t, err) // download works after 2 retries b, urActual, err := s.downloadURLWithRetries(ctx, ur) require.NoError(t, err) require.NotNil(t, urActual) assert.Equal(t, svr.URL, urActual.String()) assert.Equal(t, expected, string(b)) assert.Equal(t, retry, maxRetries) // download fails after 3 retries retry = -100 _, _, err = s.downloadURLWithRetries(ctx, ur) assert.ErrorIs(t, err, errExhaustedRetries) } ================================================ FILE: scraper/images.go ================================================ package scraper import ( "bytes" "fmt" "image" "image/jpeg" "image/png" "net/url" "github.com/cornelk/gotokit/log" "github.com/h2non/filetype" "github.com/h2non/filetype/matchers" "github.com/h2non/filetype/types" ) func (s *Scraper) checkImageForRecode(url *url.URL, data []byte) []byte { if s.config.ImageQuality == 0 { return data } kind, err := filetype.Match(data) if err != nil || kind == types.Unknown { return data } s.logger.Debug("File type detected", log.String("type", kind.MIME.Type), log.String("sub_type", kind.MIME.Subtype)) if kind.MIME.Type == matchers.TypeJpeg.MIME.Type && kind.MIME.Subtype == matchers.TypeJpeg.MIME.Subtype { if recoded := s.recodeJPEG(url, data); recoded != nil { return recoded } return data } if kind.MIME.Type == matchers.TypePng.MIME.Type && kind.MIME.Subtype == matchers.TypePng.MIME.Subtype { if recoded := s.recodePNG(url, data); recoded != nil { return recoded } return data } return data } // encodeJPEG encodes a new JPG based on the given quality setting. func (s *Scraper) encodeJPEG(img image.Image) []byte { o := &jpeg.Options{ Quality: int(s.config.ImageQuality), } outBuf := &bytes.Buffer{} if err := jpeg.Encode(outBuf, img, o); err != nil { return nil } return outBuf.Bytes() } // recodeJPEG recodes the image and returns it if it is smaller than before. 
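// A nil return value tells checkImageForRecode to keep the original bytes, either because
// decoding or encoding failed or because the re-encoded image did not turn out smaller.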
func (s *Scraper) recodeJPEG(url fmt.Stringer, data []byte) []byte { inBuf := bytes.NewBuffer(data) img, err := jpeg.Decode(inBuf) if err != nil { return nil } encoded := s.encodeJPEG(img) if encoded == nil || len(encoded) > len(data) { // only use the new file if it is smaller return nil } s.logger.Debug("Recoded JPEG", log.String("url", url.String()), log.Int("size_original", len(data)), log.Int("size_recoded", len(encoded))) return encoded } // recodePNG recodes the image and returns it if it is smaller than before. func (s *Scraper) recodePNG(url fmt.Stringer, data []byte) []byte { inBuf := bytes.NewBuffer(data) img, err := png.Decode(inBuf) if err != nil { return nil } encoded := s.encodeJPEG(img) if encoded == nil || len(encoded) > len(data) { // only use the new file if it is smaller return nil } s.logger.Debug("Recoded PNG", log.String("url", url.String()), log.Int("size_original", len(data)), log.Int("size_recoded", len(encoded))) return encoded } ================================================ FILE: scraper/scraper.go ================================================ package scraper import ( "bytes" "context" "encoding/base64" "errors" "fmt" "net/http" "net/http/cookiejar" "net/url" "regexp" "time" "github.com/cornelk/goscrape/htmlindex" "github.com/cornelk/gotokit/httpclient" "github.com/cornelk/gotokit/log" "github.com/cornelk/gotokit/set" "github.com/h2non/filetype" "github.com/h2non/filetype/types" "golang.org/x/net/html" ) // Config contains the scraper configuration. type Config struct { URL string Includes []string Excludes []string ImageQuality uint // image quality from 0 to 100%, 0 to disable reencoding MaxDepth uint // download depth, 0 for unlimited Timeout uint // time limit in seconds to process each http request OutputDirectory string Username string Password string Cookies []Cookie Header http.Header Proxy string UserAgent string } type ( httpDownloader func(ctx context.Context, u *url.URL) ([]byte, *url.URL, error) dirCreator func(path string) error fileExistenceCheck func(filePath string) bool fileWriter func(filePath string, data []byte) error ) // Scraper contains all scraping data. type Scraper struct { config Config cookies *cookiejar.Jar logger *log.Logger URL *url.URL // contains the main URL to parse, will be modified in case of a redirect auth string client *http.Client includes []*regexp.Regexp excludes []*regexp.Regexp // key is the URL of page or asset processed set.Set[string] imagesQueue []*url.URL webPageQueue []*url.URL webPageQueueDepth map[string]uint dirCreator dirCreator fileExistenceCheck fileExistenceCheck fileWriter fileWriter httpDownloader httpDownloader } // New creates a new Scraper instance. // nolint: funlen func New(logger *log.Logger, cfg Config) (*Scraper, error) { var errs []error u, err := url.Parse(cfg.URL) if err != nil { errs = append(errs, err) } u.Fragment = "" includes, err := compileRegexps(cfg.Includes) if err != nil { errs = append(errs, err) } excludes, err := compileRegexps(cfg.Excludes) if err != nil { errs = append(errs, err) } if errs != nil { return nil, errors.Join(errs...) 
} if u.Scheme == "" { u.Scheme = "http" // if no URL scheme was given default to http } cookies, err := createCookieJar(u, cfg.Cookies) if err != nil { return nil, err } // Create HTTP transport with proxy configuration transport, err := httpclient.ProxyTransportFromConfig(cfg.Proxy) if err != nil { return nil, fmt.Errorf("creating proxy transport: %w", err) } client := &http.Client{ Jar: cookies, Timeout: time.Duration(cfg.Timeout) * time.Second, Transport: transport, } s := &Scraper{ config: cfg, cookies: cookies, logger: logger, URL: u, client: client, includes: includes, excludes: excludes, processed: set.New[string](), webPageQueueDepth: map[string]uint{}, } s.dirCreator = s.createDownloadPath s.fileExistenceCheck = s.fileExists s.fileWriter = s.writeFile s.httpDownloader = s.downloadURLWithRetries if s.config.Username != "" { s.auth = "Basic " + base64.StdEncoding.EncodeToString([]byte(s.config.Username+":"+s.config.Password)) } return s, nil } // Start starts the scraping. func (s *Scraper) Start(ctx context.Context) error { if err := s.dirCreator(s.config.OutputDirectory); err != nil { return err } if !s.shouldURLBeDownloaded(s.URL, 0, false) { return errors.New("start page is excluded from downloading") } if err := s.processURL(ctx, s.URL, 0); err != nil { return err } for len(s.webPageQueue) > 0 { ur := s.webPageQueue[0] s.webPageQueue = s.webPageQueue[1:] currentDepth := s.webPageQueueDepth[ur.String()] if err := s.processURL(ctx, ur, currentDepth+1); err != nil && errors.Is(err, context.Canceled) { return err } } return nil } func (s *Scraper) processURL(ctx context.Context, u *url.URL, currentDepth uint) error { s.logger.Info("Downloading webpage", log.String("url", u.String())) data, respURL, err := s.httpDownloader(ctx, u) if err != nil { s.logger.Error("Processing HTTP Request failed", log.String("url", u.String()), log.Err(err)) return err } fileExtension := "" kind, err := filetype.Match(data) if err == nil && kind != types.Unknown { fileExtension = kind.Extension } if currentDepth == 0 { u = respURL // use the URL that the website returned as new base url for the // scrape, in case of a redirect it changed s.URL = u } buf := bytes.NewBuffer(data) doc, err := html.Parse(buf) if err != nil { s.logger.Error("Parsing HTML failed", log.String("url", u.String()), log.Err(err)) return fmt.Errorf("parsing HTML: %w", err) } index := htmlindex.New(s.logger) index.Index(u, doc) s.storeDownload(u, data, doc, index, fileExtension) if err := s.downloadReferences(ctx, index); err != nil { return err } // check first and download afterward to not hit max depth limit for // start page links because of recursive linking // a hrefs references, err := index.URLs(htmlindex.ATag) if err != nil { s.logger.Error("Parsing URL failed", log.Err(err)) } for _, ur := range references { ur.Fragment = "" if s.shouldURLBeDownloaded(ur, currentDepth, false) { s.webPageQueue = append(s.webPageQueue, ur) s.webPageQueueDepth[ur.String()] = currentDepth } } return nil } // storeDownload writes the download to a file, if a known binary file is detected, // processing of the file as page to look for links is skipped. func (s *Scraper) storeDownload(u *url.URL, data []byte, doc *html.Node, index *htmlindex.Index, fileExtension string) { // We need to distinguish between HTML pages and binary files (images, PDFs, etc.) 
// because they need different file path handling: // - HTML pages: add .html extension, handle directory indexes like /about -> /about.html // - Binary files: keep original path, so /photo.jpg stays /photo.jpg, not /photo.jpg.html // This prevents breaking binary downloads that were working before. isAPage := false if fileExtension == "" { fixed, hasChanges, err := s.fixURLReferences(u, doc, index) if err != nil { s.logger.Error("Fixing file references failed", log.String("url", u.String()), log.Err(err)) return } if hasChanges { data = fixed } // Only HTML content gets processed as a "page" - binary files stay as-is isAPage = true } filePath := s.getFilePath(u, isAPage) // always update html files, content might have changed if err := s.fileWriter(filePath, data); err != nil { s.logger.Error("Writing to file failed", log.String("URL", u.String()), log.String("file", filePath), log.Err(err)) } } // compileRegexps compiles the given regex strings to regular expressions // to be used in the include and exclude filters. func compileRegexps(regexps []string) ([]*regexp.Regexp, error) { var errs []error var compiled []*regexp.Regexp for _, exp := range regexps { re, err := regexp.Compile(exp) if err == nil { compiled = append(compiled, re) } else { errs = append(errs, err) } } if len(errs) > 0 { return nil, errors.Join(errs...) } return compiled, nil } ================================================ FILE: scraper/scraper_test.go ================================================ package scraper import ( "context" "fmt" "net/url" "testing" "github.com/cornelk/gotokit/log" "github.com/cornelk/gotokit/set" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) func newTestScraper(t *testing.T, startURL string, urls map[string][]byte) *Scraper { t.Helper() logger := log.NewTestLogger(t) cfg := Config{ URL: startURL, } scraper, err := New(logger, cfg) require.NoError(t, err) require.NotNil(t, scraper) scraper.dirCreator = func(_ string) error { return nil } scraper.fileWriter = func(_ string, _ []byte) error { return nil } scraper.fileExistenceCheck = func(_ string) bool { return false } scraper.httpDownloader = func(_ context.Context, url *url.URL) ([]byte, *url.URL, error) { ur := url.String() b, ok := urls[ur] if ok { return b, url, nil } return nil, nil, fmt.Errorf("url '%s' not found in test data", ur) } return scraper } func TestScraperLinks(t *testing.T) { indexPage := []byte(` Example `) page2 := []byte(` a a `) css := []byte(``) startURL := "https://example.org/#fragment" // start page with fragment urls := map[string][]byte{ "https://example.org/": indexPage, "https://example.org/page2": page2, "https://example.org/sub/": indexPage, "https://example.org/style.css": css, } scraper := newTestScraper(t, startURL, urls) require.NotNil(t, scraper) ctx := context.Background() err := scraper.Start(ctx) require.NoError(t, err) expectedProcessed := set.NewFromSlice([]string{ "/", "/page2", "/sub", "/style.css", }) assert.Equal(t, expectedProcessed, scraper.processed) } func TestScraperAttributes(t *testing.T) { indexPage := []byte(` `) empty := []byte(``) startURL := "https://example.org/" urls := map[string][]byte{ "https://example.org/": indexPage, "https://example.org/bg.gif": empty, } scraper := newTestScraper(t, startURL, urls) require.NotNil(t, scraper) ctx := context.Background() err := scraper.Start(ctx) require.NoError(t, err) expectedProcessed := set.NewFromSlice([]string{ "/", "/bg.gif", }) assert.Equal(t, expectedProcessed, scraper.processed) } func 
TestScraperInternalCss(t *testing.T) { indexPage := []byte(` `) empty := []byte(``) domain := "example.org" file1Reference := "background.jpg" file2Reference := "img/bg.jpg" file3Reference := "bg3.jpg" fullURL := "https://" + domain urls := map[string][]byte{ fullURL + "/": indexPage, fullURL + "/" + file1Reference: empty, fullURL + "/" + file2Reference: empty, fullURL + "/" + file3Reference: empty, } scraper := newTestScraper(t, fullURL+"/", urls) require.NotNil(t, scraper) files := map[string][]byte{} scraper.fileWriter = func(filePath string, data []byte) error { files[filePath] = data return nil } ctx := context.Background() err := scraper.Start(ctx) require.NoError(t, err) expectedProcessed := set.NewFromSlice([]string{ "/", "/" + file1Reference, "/" + file2Reference, "/" + file3Reference, }) require.Equal(t, expectedProcessed, scraper.processed) ref := domain + "/index.html" content := string(files[ref]) assert.Contains(t, content, "url('"+file1Reference+"')") assert.Contains(t, content, "url('"+file2Reference+"')") assert.Contains(t, content, "url("+file3Reference+")") } ================================================ FILE: scraper/server.go ================================================ package scraper import ( "context" "fmt" "mime" "net/http" "github.com/cornelk/gotokit/log" ) // register additional mime types for the file server; this for example makes .asp files // be handled as HTML by the browser instead of being offered as a download. var mimeTypes = map[string]string{ ".asp": "text/html; charset=utf-8", } // ServeDirectory serves a directory on a given port as a web server. func ServeDirectory(ctx context.Context, path string, port int16, logger *log.Logger) error { fs := http.FileServer(http.Dir(path)) mux := http.NewServeMux() mux.Handle("/", fs) // serve the root from the file system // update mime types for ext, mt := range mimeTypes { if err := mime.AddExtensionType(ext, mt); err != nil { return fmt.Errorf("adding mime type '%s': %w", ext, err) } } fullAddr := fmt.Sprintf("http://127.0.0.1:%d", port) logger.Info("Serving directory...", log.String("path", path), log.String("address", fullAddr)) server := &http.Server{ Addr: fmt.Sprintf(":%d", port), Handler: mux, } serverErr := make(chan error, 1) go func() { serverErr <- server.ListenAndServe() }() select { case <-ctx.Done(): //nolint: contextcheck if err := server.Shutdown(context.Background()); err != nil { return fmt.Errorf("shutting down webserver: %w", err) } return nil case err := <-serverErr: return fmt.Errorf("starting webserver: %w", err) } } ================================================ FILE: scraper/url.go ================================================ package scraper import ( "net/url" "path" "path/filepath" "strings" ) func resolveURL(base *url.URL, reference, mainPageHost string, isHyperlink bool, relativeToRoot string) string { ur, err := url.Parse(reference) if err != nil { return "" } var resolvedURL *url.URL if ur.Host != "" && ur.Host != mainPageHost { if isHyperlink { // do not change links to external websites return reference } resolvedURL = base.ResolveReference(ur) resolvedURL.Path = filepath.Join("_"+ur.Host, resolvedURL.Path) } else { if isHyperlink { ur.Path = getPageFilePath(ur) resolvedURL = base.ResolveReference(ur) } else { resolvedURL = base.ResolveReference(ur) } } if resolvedURL.Host == mainPageHost { resolvedURL.Path = urlRelativeToOther(resolvedURL, base) relativeToRoot = "" } resolvedURL.Host = "" // remove host resolvedURL.Scheme = "" // remove http/https resolved := resolvedURL.String() if resolved == "" { resolved = "/" // 
website root } else { if resolved[0] == '/' && len(relativeToRoot) > 0 { resolved = relativeToRoot + resolved[1:] } else { resolved = relativeToRoot + resolved } } if isHyperlink { if resolved[len(resolved)-1] == '/' { resolved += PageDirIndex // link dir index to index.html } else { l := strings.LastIndexByte(resolved, '/') if l != -1 && l < len(resolved) && resolved[l+1] == '#' { resolved = resolved[:l+1] + PageDirIndex + resolved[l+1:] // link fragment correct } } } resolved = strings.TrimPrefix(resolved, "/") return resolved } func urlRelativeToRoot(url *url.URL) string { var rel strings.Builder splits := strings.Split(url.Path, "/") for i := range splits { if (len(splits[i]) > 0) && (i < len(splits)-1) { rel.WriteString("../") } } return rel.String() } func urlRelativeToOther(src, base *url.URL) string { srcSplits := strings.Split(src.Path, "/") baseSplits := strings.Split(getPageFilePath(base), "/") for len(srcSplits) > 0 && len(baseSplits) > 0 { if len(srcSplits[0]) == 0 { srcSplits = srcSplits[1:] continue } if len(baseSplits[0]) == 0 { baseSplits = baseSplits[1:] continue } if srcSplits[0] == baseSplits[0] { srcSplits = srcSplits[1:] baseSplits = baseSplits[1:] } else { break } } var upLevels strings.Builder for i, split := range baseSplits { if split == "" { continue } // Page filename is not a level. if i == len(baseSplits)-1 { break } upLevels.WriteString("../") } return upLevels.String() + path.Join(srcSplits...) } ================================================ FILE: scraper/url_test.go ================================================ package scraper import ( "net/url" "testing" "github.com/stretchr/testify/assert" ) func TestResolveURL(t *testing.T) { type filePathFixture struct { BaseURL url.URL Reference string IsHyperlink bool RelativeToRoot string Resolved string } pathlessURL := url.URL{ Scheme: "https", Host: "petpic.xyz", Path: "", } URL := url.URL{ Scheme: "https", Host: "petpic.xyz", Path: "/earth/", } var fixtures = []filePathFixture{ {pathlessURL, "", true, "", "index.html"}, {pathlessURL, "#contents", true, "", "#contents"}, {URL, "brasil/index.html", true, "", "brasil/index.html"}, {URL, "brasil/rio/index.html", true, "", "brasil/rio/index.html"}, {URL, "../argentina/cat.jpg", false, "", "../argentina/cat.jpg"}, } for _, fix := range fixtures { resolved := resolveURL(&fix.BaseURL, fix.Reference, URL.Host, fix.IsHyperlink, fix.RelativeToRoot) assert.Equal(t, fix.Resolved, resolved) } } func Test_urlRelativeToOther(t *testing.T) { type filePathFixture struct { SrcURL url.URL BaseURL url.URL ExpectedSrcPath string } var fixtures = []filePathFixture{ {url.URL{Path: "/earth/brasil/rio/cat.jpg"}, url.URL{Path: "/earth/brasil/rio/"}, "cat.jpg"}, {url.URL{Path: "/earth/brasil/rio/cat.jpg"}, url.URL{Path: "/earth/"}, "brasil/rio/cat.jpg"}, {url.URL{Path: "/earth/cat.jpg"}, url.URL{Path: "/earth/brasil/rio/"}, "../../cat.jpg"}, {url.URL{Path: "/earth/argentina/cat.jpg"}, url.URL{Path: "/earth/brasil/rio/"}, "../../argentina/cat.jpg"}, {url.URL{Path: "/earth/brasil/rio/cat.jpg"}, url.URL{Path: "/mars/dogtown/"}, "../../earth/brasil/rio/cat.jpg"}, {url.URL{Path: "///earth//////cat.jpg"}, url.URL{Path: "///earth/brasil//rio////////"}, "../../cat.jpg"}, } for _, fix := range fixtures { relativeURL := urlRelativeToOther(&fix.SrcURL, &fix.BaseURL) assert.Equal(t, fix.ExpectedSrcPath, relativeURL) } } func Test_urlRelativeToRoot(t *testing.T) { type urlFixture struct { SrcURL url.URL Expected string } var fixtures = []urlFixture{ {url.URL{Path: "/earth/brasil/rio/cat.jpg"}, 
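// three directory levels above cat.jpg, hence the expected "../../../"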
"../../../"}, {url.URL{Path: "cat.jpg"}, ""}, {url.URL{Path: "/earth/argentina"}, "../"}, {url.URL{Path: "///earth//////cat.jpg"}, "../"}, } for _, fix := range fixtures { relativeURL := urlRelativeToRoot(&fix.SrcURL) assert.Equal(t, fix.Expected, relativeURL) } }