Repository: cornelk/goscrape
Branch: main
Commit: ee772e8edd93
Files: 39
Total size: 86.7 KB
Directory structure:
gitextract_eo_umi2a/
├── .codecov.yml
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ └── feature_request.md
│ └── workflows/
│ ├── codecov.yaml
│ ├── go.yaml
│ └── release.yml
├── .gitignore
├── .golangci.yml
├── .goreleaser.yaml
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── css/
│ └── css.go
├── go.mod
├── go.sum
├── htmlindex/
│ ├── attributes.go
│ ├── htmlindex.go
│ └── htmlindex_test.go
├── main.go
└── scraper/
├── checks.go
├── checks_test.go
├── cookies.go
├── download.go
├── download_test.go
├── fileutil.go
├── fileutil_test.go
├── fs.go
├── html.go
├── html_test.go
├── http.go
├── http_test.go
├── images.go
├── scraper.go
├── scraper_test.go
├── server.go
├── url.go
└── url_test.go
================================================
FILE CONTENTS
================================================
================================================
FILE: .codecov.yml
================================================
coverage:
status:
patch: off
project:
default:
target: 35%
threshold: 5%
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: bug
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
Steps to reproduce the behavior:
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: enhancement
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.
================================================
FILE: .github/workflows/codecov.yaml
================================================
name: codecov
on:
push:
branches:
- main
pull_request:
jobs:
codecov:
timeout-minutes: 15
name: Coverage
runs-on: ubuntu-latest
steps:
- name: Set up Go 1.x
uses: actions/setup-go@v5
with:
go-version: "1.24"
id: go
- name: Check out code into the Go module directory
uses: actions/checkout@v4
- name: Get dependencies
run: go mod download
- name: Run tests with coverage
run: make test-coverage
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: ./.testCoverage
================================================
FILE: .github/workflows/go.yaml
================================================
name: CI
on:
push:
branches:
- main
pull_request:
types:
- opened
- reopened
- synchronize
- ready_for_review
jobs:
build:
if: ${{ github.event_name == 'push' || !github.event.pull_request.draft || !contains(github.event.commits[0].message, '[skip ci]') }}
timeout-minutes: 15
name: Build
runs-on: ubuntu-latest
strategy:
matrix:
go: [ "1.24" ]
steps:
- name: Set up Go 1.x
uses: actions/setup-go@v5
with:
go-version: ${{ matrix.go }}
id: go
- name: Check out code into the Go module directory
uses: actions/checkout@v4
- name: Install linters
run: make install-linters
- name: Get dependencies
run: go mod download
- name: Run tests
run: make test
- name: Run linter
run: make lint
================================================
FILE: .github/workflows/release.yml
================================================
name: release
on:
push:
tags:
- 'v*'
jobs:
goreleaser:
runs-on: ubuntu-latest
steps:
-
name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
-
name: Set up Go
uses: actions/setup-go@v5
with:
go-version: '1.24'
check-latest: true
cache: true
-
name: Docker login
uses: docker/login-action@v3
with:
registry: 'ghcr.io'
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
-
name: Run GoReleaser
uses: goreleaser/goreleaser-action@v6
with:
version: latest
args: release --clean
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
================================================
FILE: .gitignore
================================================
*.db
*.dmp
*.exe
*.iml
*.local
*.out
*.prof
*.test
.DS_Store
.fleet
.idea
.vscode
/*.log
.testCoverage
dist/
goscrape
================================================
FILE: .golangci.yml
================================================
version: "2"
run:
modules-download-mode: readonly
linters:
enable:
- asasalint # Check for pass []any as any in variadic func(...any)
- asciicheck # Checks that code identifiers do not have non-ASCII symbols
- bidichk # Checks for dangerous unicode character sequences
- bodyclose # Checks whether HTTP response body is closed successfully
- containedctx # Detects struct contained context.Context field
- contextcheck # Check whether the function uses a non-inherited context
- copyloopvar # Detects places where loop variables are copied
- cyclop # Checks function and package cyclomatic complexity
- decorder # Check declaration order of types, constants, variables and functions
- dogsled # Checks assignments with too many blank identifiers
- durationcheck # Check for two durations multiplied together
- embeddedstructfieldcheck # Checks embedded struct field positioning and formatting
- err113 # Check errors handling expressions
- errcheck # Checks for unchecked errors in Go code
- errname # Checks that sentinel errors are prefixed with Err
- errorlint # Find code that will cause problems with error wrapping
- funlen # Checks for long functions
- gocheckcompilerdirectives # Checks that go compiler directives are valid
- gocognit # Computes and checks the cognitive complexity of functions
- goconst # Finds repeated strings that could be replaced by a constant
- gocritic # Provides diagnostics that check for bugs, performance and style issues
- gocyclo # Computes and checks the cyclomatic complexity of functions
- godot # Check if comments end in a period
- govet # Examines Go source code and reports suspicious constructs
- grouper # Analyze expression groups
- iface # Detect the incorrect use of interfaces
- ineffassign # Detects when assignments to existing variables are not used
- intrange # Find places where for loops could use an integer range
- ireturn # Accept Interfaces, Return Concrete Types
- maintidx # Measures the maintainability index of each function
- makezero # Find slice declarations with non-zero initial length
- mirror # Reports wrong mirror patterns of bytes/strings usage
- misspell # Finds commonly misspelled English words
- modernize # Suggest simplifications using modern Go features
- nakedret # Finds naked returns in functions longer than a maximum size
- nestif # Reports deeply nested if statements
- nilerr # Finds code that returns nil even if it checks that error is not nil
- nilnil # Checks that there is no simultaneous return of nil error and invalid value
- noctx # Detects missing usage of context.Context
- nosprintfhostport # Checks for misuse of Sprintf to construct a host with port
- perfsprint # Checks that fmt.Sprintf can be replaced with a faster alternative
- prealloc # Find slice declarations that could be pre-allocated
- predeclared # Find code that shadows one of Go's predeclared identifiers
- reassign # Checks that package variables are not reassigned
- revive # Fast, configurable, extensible linter for Go
- staticcheck # Set of rules from staticcheck
- testifylint # Checks usage of github.com/stretchr/testify
- thelper # Detects test helpers which do not call t.Helper()
- tparallel # Detects inappropriate usage of t.Parallel()
- unconvert # Remove unnecessary type conversions
- unparam # Reports unused function parameters
- unused # Checks Go code for unused constants, variables, functions and types
- usestdlibvars # Detect the possibility to use variables/constants from stdlib
- wastedassign # Finds wasted assignment statements
- whitespace # Checks for unnecessary newlines
- wrapcheck # Checks that errors returned from external packages are wrapped
settings:
cyclop:
max-complexity: 15
whitespace:
multi-if: true # Enforces newlines (or comments) after every multi-line if statement
multi-func: true # Enforces newlines (or comments) after every multi-line function signature
exclusions:
rules:
- linters:
- err113
text: do not define dynamic errors
formatters:
enable:
- gci # Check if code and import statements are formatted, with additional rules
- gofmt # Check if the code is formatted according to 'gofmt' command
- goimports # Checks if the code and import statements are formatted according to the 'goimports' command
================================================
FILE: .goreleaser.yaml
================================================
version: 2
before:
hooks:
- go mod download
builds:
- id: goscrape
binary: goscrape
dir: .
env:
- CGO_ENABLED=0
targets:
- go_first_class
flags:
- -trimpath
ldflags:
- -s -w -X main.version={{.Version}} -X main.commit={{.Commit}} -X main.date={{ .CommitDate }}
dockers:
- id: goscrape
image_templates:
- "ghcr.io/cornelk/goscrape:{{ .Version }}"
universal_binaries:
- replace: false
archives:
- id: goscrape
builds: ['goscrape']
name_template: >-
{{ .ProjectName }}_v{{ .Version }}_
{{- if eq .Os "darwin" }}macos
{{- else }}{{ .Os }}{{ end }}_
{{- if eq .Arch "amd64" }}64bit
{{- else if eq .Arch "386" }}32bit
{{- else if eq .Arch "Darwin" }}macos
{{- else }}{{ .Arch }}{{ end }}
format_overrides:
- goos: windows
format: zip
checksum:
name_template: 'checksums.txt'
snapshot:
name_template: "{{ .Tag }}-snapshot"
changelog:
disable: true
================================================
FILE: CHANGELOG.md
================================================
# Changelog for goscrape
All notable changes to this project will be documented in this file.
## [v0.3.0] - 2025-01-01
Added:
* body background attribute support
* fragments at the end of the URL are now filtered
* support srcset attribute for image downloading
Fixed:
* inline css gets processed correctly
* --include / --exclude parameter list handling
## [v0.2.0] - 2024-06-21
Added:
* add cookie loading and storing support
* add docker image build
* add header parameter support
* add user agent parameter support
* support serving directory in local webserver
Changed:
* improved error messages
* reduced dependencies and binary size
Fixed:
* fix asset downloading
* fix downloading of binary files
* fix logger output
* fix logger verbose flag usage
* fix stopping of program using ctrl+c
## [v0.1.1] - 2023-02-11
Added:
* binaries for releases
* http request timeout configuration support
* proxy configuration support
Fixed:
* do not abort scraping on error
## [v0.1.0] - 2022-06-26
First version of goscrape released.
================================================
FILE: Dockerfile
================================================
FROM gcr.io/distroless/static-debian12
COPY goscrape /
ENTRYPOINT ["./goscrape"]
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2017 Cornel
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Makefile
================================================
GOLANGCI_VERSION = v2.6.0
help: ## show help, shown by default if no target is specified
@grep -E '^[0-9a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
lint: ## run code linters
golangci-lint run
test: ## run tests
go test -timeout 10s -race ./...
test-coverage: ## run unit tests and create test coverage
go test -timeout 10s ./... -coverprofile .testCoverage -covermode=atomic -coverpkg=./...
test-coverage-web: test-coverage ## run unit tests and show test coverage in browser
go tool cover -func .testCoverage | grep total | awk '{print "Total coverage: "$$3}'
go tool cover -html=.testCoverage
install: ## install all binaries
go install -buildvcs=false .
install-linters: ## install all linters
go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@${GOLANGCI_VERSION}
release-snapshot: ## build release binaries from current git state as snapshot
goreleaser release --snapshot --clean
================================================
FILE: README.md
================================================
# goscrape - create offline browsable copies of websites
[Build status](https://github.com/cornelk/goscrape/actions)
[Go Reference](https://pkg.go.dev/github.com/cornelk/goscrape)
[Go Report Card](https://goreportcard.com/report/github.com/cornelk/goscrape)
[Code coverage](https://codecov.io/gh/cornelk/goscrape)
A web scraper built with Golang. It downloads the content of a website and allows it to be archived and read offline.
## Features
Features and advantages over existing tools like wget, httrack, Teleport Pro:
* Free and open source
* Available for all platforms that Golang supports
* JPEG and PNG images can be converted down in quality to save disk space
* Excluded URLs will not be fetched (unlike [wget](https://savannah.gnu.org/bugs/?20808))
* No incomplete temp files are left on disk
* Downloaded asset files are skipped in a new scraper run
* Assets from external domains are downloaded automatically
* Sane default values
## Limitations
* No GUI version, console only
## Installation
There are 2 options to install goscrape:
1. Download and unpack a binary release from [Releases](https://github.com/cornelk/goscrape/releases)
or
2. Compile the latest release from source:
```
go install github.com/cornelk/goscrape@latest
```
Compiling the tool from source code requires a recent version of [Golang](https://go.dev/) to be installed.
## Usage
Scrape a website by running
```
goscrape http://website.com
```
To serve the downloaded website directory with a locally run webserver use
```
goscrape --serve website.com
```
## Options
```
--include INCLUDE, -n INCLUDE
only include URLs with PERL Regular Expressions support
--exclude EXCLUDE, -x EXCLUDE
exclude URLs with PERL Regular Expressions support
--output OUTPUT, -o OUTPUT
output directory to write files to
--depth DEPTH, -d DEPTH
download depth, 0 for unlimited [default: 10]
--imagequality IMAGEQUALITY, -i IMAGEQUALITY
image quality, 0 to disable reencoding
--timeout TIMEOUT, -t TIMEOUT
time limit in seconds for each HTTP request to connect and read the request body
--serve SERVE, -s SERVE
serve the website using a webserver
--serverport SERVERPORT, -r SERVERPORT
port to use for the webserver [default: 8080]
--cookiefile COOKIEFILE, -c COOKIEFILE
file containing the cookie content
--savecookiefile SAVECOOKIEFILE
file to save the cookie content
--header HEADER, -h HEADER
HTTP header to use for scraping
--proxy PROXY, -p PROXY
proxy to use in format scheme://[user:password@]host:port (supports HTTP, HTTPS, SOCKS5 protocols)
--user USER, -u USER user[:password] to use for HTTP authentication
--useragent USERAGENT, -a USERAGENT
user agent to use for scraping
--verbose, -v verbose output
--help, -h display this help and exit
--version display version and exit
```
## Cookies
Cookies can be passed in a file using the `--cookiefile` parameter and a file containing
cookies in the following format:
```
[{"name":"user","value":"123"},{"name":"session","value":"sid"}]
```
## Proxy Configuration
The `--proxy` flag supports multiple proxy protocols for scraping through different types of proxy servers:
### Supported Proxy Types
- **HTTP proxies**: `http://proxy.example.com:8080`
- **HTTPS proxies**: `https://proxy.example.com:8080`
- **SOCKS5 proxies**: `socks5://proxy.example.com:1080`
### Authentication
Proxies requiring authentication can include credentials in the URL:
```bash
# HTTP proxy with authentication
goscrape --proxy http://username:password@proxy.example.com:8080 http://website.com
# SOCKS5 proxy with authentication
goscrape --proxy socks5://user:pass@proxy.example.com:1080 http://website.com
```
================================================
FILE: css/css.go
================================================
// Package css provides a CSS parser that can process CSS data and call a processor for every found URL.
package css
import (
"net/url"
"regexp"
"strings"
"github.com/cornelk/gotokit/log"
"github.com/gorilla/css/scanner"
)
// cssURLRe extracts the target of a CSS url(...) token. The first capture
// group holds the URL without the optional surrounding quotes. CSS allows
// whitespace between the parentheses and the (optionally quoted) URL; it is
// skipped here so that it never ends up inside the captured URL and so that a
// line break before the closing parenthesis does not prevent a match.
var cssURLRe = regexp.MustCompile(`^url\(\s*['"]?(.*?)['"]?\s*\)$`)
// Token represents a token and the corresponding string.
// It is an alias of the token type of the CSS scanner package.
type Token = scanner.Token
// urlProcessor is the callback that Process invokes for every URL found in
// the CSS data. It receives the scanner token of the URL, the URL string as
// written in the CSS (without url(...) wrapper and quotes) and the parsed URL.
type urlProcessor func(token *Token, data string, url *url.URL)
// Process scans the given CSS data and invokes processor once for every URL
// token that is found.
//
// Embedded "data:" URLs are ignored. Every other URL is parsed relative to
// the passed url (the parameter shadows the net/url package, so url.Parse is
// the method of the base URL). URLs that fail to parse are logged and
// skipped. Scanning stops at the end of the data or at the first scanner
// error.
func Process(logger *log.Logger, url *url.URL, data string, processor urlProcessor) {
	tokens := scanner.New(data)

	for {
		tok := tokens.Next()

		switch tok.Type {
		case scanner.TokenEOF, scanner.TokenError:
			return
		case scanner.TokenURI:
			// handled below
		default:
			continue
		}

		groups := cssURLRe.FindStringSubmatch(tok.Value)
		if groups == nil {
			continue
		}

		ref := groups[1]
		if strings.HasPrefix(strings.ToLower(ref), "data:") {
			continue // skip embedded data
		}

		resolved, err := url.Parse(ref)
		if err != nil {
			logger.Error("Parsing URL failed",
				log.String("url", ref),
				log.Err(err))
			continue
		}

		processor(tok, ref, resolved)
	}
}
================================================
FILE: go.mod
================================================
module github.com/cornelk/goscrape
go 1.24.0
require (
github.com/alexflint/go-arg v1.6.0
github.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b
github.com/gorilla/css v1.0.1
github.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb
github.com/stretchr/testify v1.11.1
golang.org/x/net v0.46.0
)
require (
github.com/alexflint/go-scalar v1.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/kr/pretty v0.3.0 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
================================================
FILE: go.sum
================================================
github.com/alexflint/go-arg v1.6.0 h1:wPP9TwTPO54fUVQl4nZoxbFfKCcy5E6HBCumj1XVRSo=
github.com/alexflint/go-arg v1.6.0/go.mod h1:A7vTJzvjoaSTypg4biM5uYNTkJ27SkNTArtYXnlqVO8=
github.com/alexflint/go-scalar v1.2.0 h1:WR7JPKkeNpnYIOfHRa7ivM21aWAdHD0gEWHCx+WQBRw=
github.com/alexflint/go-scalar v1.2.0/go.mod h1:LoFvNMqS1CPrMVltza4LvnGKhaSpc3oyLEBUZVhhS2o=
github.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b h1:jT7eWHlrlvjVXSV47U4IjdgI75nLhf/327EHYaGLzso=
github.com/cornelk/gotokit v0.0.0-20251031201833-083458d3990b/go.mod h1:05rfUdBvcZdkWv0/oZfXevCTUzos6WafYzkPzkSkKbY=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=
github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=
github.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb h1:GlQyMv2C48qmfPItvAXFoyN341Swxp9JNVeUZxnmbJw=
github.com/h2non/filetype v1.1.4-0.20231228185113-6469358c2bcb/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0=
github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.6.1 h1:/FiVV8dS/e+YqF2JvO3yXRFbBLTIuSDkuC7aBOAvL+k=
github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4=
golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
================================================
FILE: htmlindex/attributes.go
================================================
package htmlindex
import (
"net/url"
"github.com/cornelk/gotokit/log"
"github.com/cornelk/gotokit/set"
"golang.org/x/net/html"
)
// nodeAttributeParserData bundles the input that is handed to a
// nodeAttributeParser.
type nodeAttributeParserData struct {
// logger is made available to the parser, the style parser passes it on to
// the CSS processing.
logger *log.Logger
// url is the base URL of the document that is being indexed.
url *url.URL
// node is the HTML node that is being processed.
node *html.Node
// attribute is the key of the attribute that is being processed. It is empty
// when the parser is called for a node without configured attributes.
attribute string
// value is the attribute value with surrounding whitespace removed. It is
// empty when the parser is called for a node without configured attributes.
value string
}
// nodeAttributeParser returns the URL values of the attribute of the node and
// whether the attribute has been processed. If the attribute has not been
// processed, the caller additionally uses the trimmed raw attribute value as
// URL reference.
type nodeAttributeParser func(data nodeAttributeParserData) ([]string, bool)
// Node describes an HTML tag and its attributes that can contain URLs.
type Node struct {
// Attributes lists the names of the attributes of the tag that are checked
// for URLs.
Attributes []string
// noChildParsing disables the indexing of the child nodes of the tag.
noChildParsing bool
// parser is an optional custom parser for extracting the URLs of the tag, it
// can be nil.
parser nodeAttributeParser
}
// Names of the HTML attributes that are checked for URLs.
// nolint: revive
const (
BackgroundAttribute = "background"
HrefAttribute = "href"
DataSrcAttribute = "data-src"
SrcAttribute = "src"
DataSrcSetAttribute = "data-srcset"
SrcSetAttribute = "srcset"
)
// Names of the HTML tags that are checked for URLs, used as keys of Nodes.
// nolint: revive
const (
ATag = "a"
BodyTag = "body"
ImgTag = "img"
LinkTag = "link"
ScriptTag = "script"
StyleTag = "style"
)
// Nodes describes the HTML tags and their attributes that can contain URLs.
// The key is the tag name.
var Nodes = map[string]Node{
ATag: {
Attributes: []string{HrefAttribute},
},
BodyTag: {
Attributes: []string{BackgroundAttribute},
},
// the srcset variants of the image attributes hold multiple URLs and are
// split by the custom parser, the other attributes are used as they are
ImgTag: {
Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute},
parser: srcSetValueSplitter,
},
LinkTag: {
Attributes: []string{HrefAttribute},
},
ScriptTag: {
Attributes: []string{SrcAttribute},
},
// the style tag has no URL attributes, its URLs are extracted from the CSS
// content by the custom parser and its child nodes are not indexed
StyleTag: {
noChildParsing: true,
parser: styleParser,
},
}
// SrcSetAttributes contains the attributes that contain srcset values.
// The values of these attributes are comma separated lists of URLs that get
// split by srcSetValueSplitter.
var SrcSetAttributes = set.NewFromSlice([]string{
DataSrcSetAttribute,
SrcSetAttribute,
})
================================================
FILE: htmlindex/htmlindex.go
================================================
// Package htmlindex provides an index for all HTML tags of relevance for scraping.
package htmlindex
import (
"fmt"
"net/url"
"slices"
"sort"
"strings"
"github.com/cornelk/goscrape/css"
"github.com/cornelk/gotokit/log"
"golang.org/x/net/html"
)
// Index provides an index for all HTML tags of relevance for scraping.
type Index struct {
// logger is handed to the node attribute parsers
logger *log.Logger
// key is HTML tag, value is a map of all its urls and the HTML nodes for it.
// The URLs are stored resolved against the base URL that was passed when
// indexing.
data map[string]map[string][]*html.Node
}
// New creates an empty index that passes the given logger to its node
// attribute parsers.
func New(logger *log.Logger) *Index {
	idx := new(Index)
	idx.logger = logger
	idx.data = map[string]map[string][]*html.Node{}
	return idx
}
// Index adds all element nodes below the given node to the index. URL
// references of the elements are resolved against baseURL. Nodes that are not
// elements, like text or comments, are ignored.
func (idx *Index) Index(baseURL *url.URL, node *html.Node) {
	for current := node.FirstChild; current != nil; current = current.NextSibling {
		if current.Type != html.ElementNode {
			continue
		}
		idx.indexElementNode(baseURL, node, current)
	}
}
// indexElementNode adds the URL references of the element node child to the
// index and indexes the children of the element afterwards.
//
// References are only collected for tags that are listed in Nodes. An entry
// for the tag is created in the index in any case. The second parameter is
// the parent of child, it is not needed for indexing and only kept for
// compatibility with the callers.
func (idx *Index) indexElementNode(baseURL *url.URL, _, child *html.Node) {
	info, known := Nodes[child.Data]

	var references []string
	if known {
		references = idx.nodeAttributeURLs(baseURL, child, info.parser, info.Attributes...)
	}

	m, ok := idx.data[child.Data]
	if !ok {
		m = map[string][]*html.Node{}
		idx.data[child.Data] = m
	}

	for _, reference := range references {
		m[reference] = append(m[reference], child)
	}

	// The check has to look at the element itself: its parent always has at
	// least one child (the element), so checking the parent never skips
	// anything.
	if child.FirstChild != nil && !info.noChildParsing {
		idx.Index(baseURL, child)
	}
}
// URLs returns the parsed URLs of all references that were indexed for the
// given tag, ordered by their string representation. For a tag that is not
// part of the index nil is returned. An error is returned if one of the
// stored references can not be parsed as URL.
func (idx *Index) URLs(tag string) ([]*url.URL, error) {
	references, found := idx.data[tag]
	if !found {
		return nil, nil
	}

	// map iteration order is random, sort the keys for a stable result
	keys := make([]string, 0, len(references))
	for reference := range references {
		keys = append(keys, reference)
	}
	sort.Strings(keys)

	parsed := make([]*url.URL, len(keys))
	for i, fullURL := range keys {
		u, err := url.Parse(fullURL)
		if err != nil {
			return nil, fmt.Errorf("parsing URL '%s': %w", fullURL, err)
		}
		parsed[i] = u
	}

	return parsed, nil
}
// Nodes returns the indexed URLs of the given tag together with the HTML
// nodes that reference them. For a tag that is not part of the index an empty
// map is returned.
func (idx *Index) Nodes(tag string) map[string][]*html.Node {
	if nodes, found := idx.data[tag]; found {
		return nodes
	}

	return map[string][]*html.Node{}
}
// nodeAttributeURLs collects the URL references of the given node and returns
// them resolved against the base URL.
//
// Every attribute of the node that is listed in attributeNames is evaluated.
// If a parser is set, it gets the chance to extract the references of the
// attribute; if there is no parser or the parser reports the attribute as not
// handled, the trimmed attribute value is used as reference. References that
// can not be parsed as URL are dropped silently.
//
// If no attribute names are given but a parser is set, the parser is called
// once for the node itself.
func (idx *Index) nodeAttributeURLs(baseURL *url.URL, node *html.Node,
	parser nodeAttributeParser, attributeNames ...string) []string {

	var resolved []string

	collect := func(refs []string) {
		for _, ref := range refs {
			parsed, err := url.Parse(ref)
			if err == nil {
				resolved = append(resolved, baseURL.ResolveReference(parsed).String())
			}
		}
	}

	for _, attr := range node.Attr {
		if !slices.Contains(attributeNames, attr.Key) {
			continue
		}

		value := strings.TrimSpace(attr.Val)

		var refs []string
		handled := false
		if parser != nil {
			refs, handled = parser(nodeAttributeParserData{
				logger:    idx.logger,
				url:       baseURL,
				node:      node,
				attribute: attr.Key,
				value:     value,
			})
		}
		// without a parser the attribute is never handled
		if !handled {
			refs = append(refs, value)
		}

		collect(refs)
	}

	// special case to support style tag
	if parser != nil && len(attributeNames) == 0 {
		refs, _ := parser(nodeAttributeParserData{
			logger: idx.logger,
			url:    baseURL,
			node:   node,
		})
		collect(refs)
	}

	return resolved
}
// srcSetValueSplitter returns the URL values of the srcset attribute of img nodes.
//
// The attribute value is a comma separated list of image candidates, each
// consisting of a URL that is optionally followed by whitespace and a width
// or pixel density descriptor. Only the URLs are returned. Empty candidates,
// as produced by an empty attribute value or a trailing comma, are skipped:
// an empty reference would resolve to the URL of the page itself and be
// indexed as an image.
//
// For attributes that are not srcset attributes nil and false is returned to
// signal that the attribute has not been handled.
func srcSetValueSplitter(data nodeAttributeParserData) ([]string, bool) {
	if !SrcSetAttributes.Contains(data.attribute) {
		return nil, false
	}

	// split the set of responsive images
	candidates := strings.Split(data.value, ",")
	urls := make([]string, 0, len(candidates))
	for _, candidate := range candidates {
		// the URL is the first field, any kind of whitespace can separate it
		// from the descriptor
		fields := strings.Fields(candidate)
		if len(fields) == 0 {
			continue
		}
		urls = append(urls, fields[0])
	}

	return urls, true
}
// styleParser extracts the URLs that are referenced in the CSS content of a
// style tag. The content is read from the first child of the node. If the
// node has no child, nil and false is returned, otherwise the found URLs and
// true.
func styleParser(data nodeAttributeParserData) ([]string, bool) {
	content := data.node.FirstChild
	if content == nil {
		return nil, false
	}

	var found []string
	css.Process(data.logger, data.url, content.Data, func(_ *css.Token, _ string, u *url.URL) {
		found = append(found, u.String())
	})

	return found, true
}
================================================
FILE: htmlindex/htmlindex_test.go
================================================
package htmlindex
import (
"bytes"
"net/url"
"testing"
"github.com/cornelk/gotokit/log"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/net/html"
)
// TestIndex verifies that the references of a and img tags are indexed and
// that a tag without entries yields empty results.
func TestIndex(t *testing.T) {
	// The document needs to contain the elements that are asserted below.
	// Absolute URLs are used so that the expected references do not depend on
	// the base URL that testSetup uses.
	input := []byte(`
<html lang="en">
<body>
<a href="https://domain.com/wp-content/uploads/document.pdf" rel="doc">Guide</a>
<img src="https://domain.com/test.jpg"/>
</body>
</html>
`)

	idx := testSetup(t, input)

	// check a tag
	nodeTag := "a"
	references, err := idx.URLs(nodeTag)
	require.NoError(t, err)
	require.Len(t, references, 1)

	tagURL := "https://domain.com/wp-content/uploads/document.pdf"
	assert.Equal(t, tagURL, references[0].String())

	urls := idx.Nodes(nodeTag)
	require.Len(t, urls, 1)
	nodes, ok := urls[tagURL]
	require.True(t, ok)
	require.Len(t, nodes, 1)
	node := nodes[0]
	assert.Equal(t, nodeTag, node.Data)

	// check img tag
	nodeTag = "img"
	references, err = idx.URLs(nodeTag)
	require.NoError(t, err)
	require.Len(t, references, 1)

	tagURL = "https://domain.com/test.jpg"
	assert.Equal(t, tagURL, references[0].String())

	// check for not existing tag
	nodeTag = "not-existing"
	references, err = idx.URLs(nodeTag)
	require.NoError(t, err)
	require.Empty(t, references)
	urls = idx.Nodes(nodeTag)
	require.Empty(t, urls)
}
func TestIndexImg(t *testing.T) {
input := []byte(`