Repository: gocolly/colly
Branch: master
Commit: abd17898f26e
Files: 67
Total size: 253.0 KB
Directory structure:
gitextract_san38b80/
├── .codecov.yml
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── bug_report.md
│ │ ├── config.yml
│ │ └── feature_request.md
│ └── workflows/
│ └── ci.yml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE.txt
├── README.md
├── VERSION
├── _examples/
│ ├── README.md
│ ├── basic/
│ │ └── basic.go
│ ├── coursera_courses/
│ │ └── coursera_courses.go
│ ├── cryptocoinmarketcap/
│ │ └── cryptocoinmarketcap.go
│ ├── error_handling/
│ │ └── error_handling.go
│ ├── factba.se/
│ │ └── factbase.go
│ ├── google_groups/
│ │ └── google_groups.go
│ ├── hackernews_comments/
│ │ └── hackernews_comments.go
│ ├── instagram/
│ │ └── instagram.go
│ ├── local_files/
│ │ ├── html/
│ │ │ ├── child_page/
│ │ │ │ ├── one.html
│ │ │ │ ├── three.html
│ │ │ │ └── two.html
│ │ │ └── index.html
│ │ └── local_files.go
│ ├── login/
│ │ └── login.go
│ ├── max_depth/
│ │ └── max_depth.go
│ ├── multipart/
│ │ └── multipart.go
│ ├── openedx_courses/
│ │ └── openedx_courses.go
│ ├── parallel/
│ │ └── parallel.go
│ ├── proxy_switcher/
│ │ └── proxy_switcher.go
│ ├── queue/
│ │ └── queue.go
│ ├── random_delay/
│ │ └── random_delay.go
│ ├── rate_limit/
│ │ └── rate_limit.go
│ ├── reddit/
│ │ └── reddit.go
│ ├── request_context/
│ │ └── request_context.go
│ ├── scraper_server/
│ │ └── scraper_server.go
│ ├── shopify_sitemap/
│ │ └── shopify_sitemap.go
│ ├── url_filter/
│ │ └── url_filter.go
│ └── xkcd_store/
│ └── xkcd_store.go
├── cmd/
│ └── colly/
│ └── colly.go
├── colly.go
├── colly_test.go
├── context.go
├── context_test.go
├── debug/
│ ├── debug.go
│ ├── logdebugger.go
│ └── webdebugger.go
├── extensions/
│ ├── extensions.go
│ ├── random_user_agent.go
│ ├── referer.go
│ └── url_length_filter.go
├── go.mod
├── go.sum
├── htmlelement.go
├── http_backend.go
├── http_trace.go
├── http_trace_test.go
├── proxy/
│ └── proxy.go
├── queue/
│ ├── queue.go
│ └── queue_test.go
├── request.go
├── response.go
├── storage/
│ └── storage.go
├── unmarshal.go
├── unmarshal_test.go
├── xmlelement.go
└── xmlelement_test.go
================================================
FILE CONTENTS
================================================
================================================
FILE: .codecov.yml
================================================
comment: false
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''
---
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: true
contact_links:
- name: Question
url: https://stackoverflow.com/
about: Questions should go to Stack Overflow. You can use go-colly tag.
================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---
================================================
FILE: .github/workflows/ci.yml
================================================
---
name: CI
on:
push:
branches:
- '**'
pull_request:
jobs:
test:
name: Test ${{matrix.go}}
runs-on: [ubuntu-latest]
strategy:
fail-fast: false
max-parallel: 4
matrix:
go: [
"1.24",
"1.23",
"1.22",
"1.21",
]
steps:
- name: Checkout branch
uses: actions/checkout@v2
- name: Setup go
uses: actions/setup-go@v2
with:
go-version: ${{matrix.go}}
- name: Test
run: |
go install golang.org/x/lint/golint@latest
OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1)
OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1)
golint -set_exit_status
go vet -v ./...
go test -race -v -coverprofile=coverage.txt -covermode=atomic ./...
build:
name: Build ${{matrix.go}}
runs-on: [ubuntu-latest]
strategy:
fail-fast: false
max-parallel: 4
matrix:
go: [
"1.24",
"1.23",
"1.22",
"1.21",
]
steps:
- name: Checkout branch
uses: actions/checkout@v2
- name: Setup go
uses: actions/setup-go@v2
with:
go-version: ${{matrix.go}}
- name: Build
run: |
go install golang.org/x/lint/golint@latest
OUT="$(go get -a)"; test -z "$OUT" || (echo "$OUT" && return 1)
OUT="$(gofmt -l -d ./)"; test -z "$OUT" || (echo "$OUT" && return 1)
golint -set_exit_status
go build
codecov:
name: Codecov
runs-on: [ubuntu-latest]
needs:
- test
- build
steps:
- name: Run Codecov
run: bash <(curl -s https://codecov.io/bash)
================================================
FILE: CHANGELOG.md
================================================
# 2.1.0 - 2020.06.09
- HTTP tracing support
- New callback: OnResponseHeader
- Queue fixes
- New collector option: Collector.CheckHead
- Proxy fixes
- Fixed POST revisit checking
- Updated dependencies
# 2.0.0 - 2019.11.28
- Breaking change: Change Collector.RedirectHandler member to Collector.SetRedirectHandler function
- Go module support
- Collector.HasVisited method added to be able to check if a URL has been visited
- Collector.SetClient method introduced
- HTMLElement.ChildTexts method added
- New user agents
- Multiple bugfixes
# 1.2.0 - 2019.02.13
- Compatibility with the latest htmlquery package
- New request shortcut for HEAD requests
- Check URL availability before visiting
- Fix proxy URL value
- Request counter fix
- Minor fixes in examples
# 1.1.0 - 2018.08.13
- Appengine integration takes context.Context instead of http.Request (API change)
- Added "Accept" http header by default to every request
- Support slices of pointers in unmarshal
- Fixed a race condition in queues
- ForEachWithBreak method added to HTMLElement
- Added a local file example
- Support gzip decompression of response bodies
- Don't share waitgroup when cloning a collector
- Fixed instagram example
# 1.0.0 - 2018.05.13
================================================
FILE: CONTRIBUTING.md
================================================
# Contribute
## Introduction
First, thank you for considering contributing to colly! It's people like you that make the open source community such a great community! 😊
We welcome any type of contribution, not only code. You can help with
- **QA**: file bug reports, the more details you can give the better (e.g. screenshots with the console open)
- **Marketing**: writing blog posts, how-tos, printing stickers, ...
- **Community**: presenting the project at meetups, organizing a dedicated meetup for the local community, ...
- **Code**: take a look at the [open issues](https://github.com/gocolly/colly/issues). Even if you can't write code, commenting on them, showing that you care about a given issue matters. It helps us triage them.
- **Money**: we welcome financial contributions in full transparency on our [open collective](https://opencollective.com/colly).
## Your First Contribution
Working on your first Pull Request? You can learn how from this *free* series, [How to Contribute to an Open Source Project on GitHub](https://app.egghead.io/playlists/how-to-contribute-to-an-open-source-project-on-github).
## Submitting code
Any code change should be submitted as a pull request. The description should explain what the code does and give steps to execute it. The pull request should also contain tests.
## Code review process
The bigger the pull request, the longer it will take to review and merge. Try to break down large pull requests into smaller chunks that are easier to review and merge.
It is also always helpful to have some context for your pull request. What was the purpose? Why does it matter to you?
## Financial contributions
We also welcome financial contributions in full transparency on our [open collective](https://opencollective.com/colly).
Anyone can file an expense. If the expense makes sense for the development of the community, it will be "merged" in the ledger of our open collective by the core contributors and the person who filed the expense will be reimbursed.
## Questions
If you have any questions, create an [issue](https://github.com/gocolly/colly/issues/new) (protip: do a quick search first to see whether someone else has already asked the same question!).
You can also reach us at hello@colly.opencollective.com.
## Credits
### Contributors
Thank you to all the people who have already contributed to colly!
This is a test page This is a test paragraph
### Backers
Thank you to all our backers! [[Become a backer](https://opencollective.com/colly#backer)]
### Sponsors
Thank you to all our sponsors! (please ask your company to also support this open source project by [becoming a sponsor](https://opencollective.com/colly#sponsor))
================================================
FILE: LICENSE.txt
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# Colly
Lightning Fast and Elegant Scraping Framework for Gophers
Colly provides a clean interface to write any kind of crawler/scraper/spider.
With Colly you can easily extract structured data from websites, which can be used for a wide range of applications, like data mining, data processing or archiving.
[](https://pkg.go.dev/github.com/gocolly/colly/v2)
[](#backers) [](#sponsors) [](https://github.com/gocolly/colly/actions/workflows/ci.yml)
[](http://goreportcard.com/report/gocolly/colly)
[](https://github.com/gocolly/colly/tree/master/_examples)
[](https://codecov.io/github/gocolly/colly?branch=master)
[](https://app.fossa.io/projects/git%2Bgithub.com%2Fgocolly%2Fcolly?ref=badge_shield)
[](https://twitter.com/gocolly)
## Features
- Clean API
- Fast (>1k requests/sec on a single core)
- Manages request delays and maximum concurrency per domain
- Automatic cookie and session handling
- Sync/async/parallel scraping
- Caching
- Automatic encoding of non-unicode responses
- Robots.txt support
- Distributed scraping
- Configuration via environment variables
- Extensions
## Example
```go
import (
"fmt"
"github.com/gocolly/colly/v2"
)
func main() {
c := colly.NewCollector()
// Find and visit all links
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})
c.OnRequest(func(r *colly.Request) {
fmt.Println("Visiting", r.URL)
})
c.Visit("http://go-colly.org/")
}
```
See [examples folder](https://github.com/gocolly/colly/tree/master/_examples) for more detailed examples.
## Installation
`go get github.com/gocolly/colly/v2`
## Bugs
Bugs or suggestions? Visit the [issue tracker](https://github.com/gocolly/colly/issues) or join `#colly` on freenode
## Other Projects Using Colly
Below is a list of public, open source projects that use Colly:
- [greenpeace/check-my-pages](https://github.com/greenpeace/check-my-pages) Scraping script to test the Spanish Greenpeace web archive.
- [altsab/gowap](https://github.com/altsab/gowap) Wappalyzer implementation in Go.
- [jesuiscamille/goquotes](https://github.com/jesuiscamille/goquotes) A quotes scraper, making your day a little better!
- [jivesearch/jivesearch](https://github.com/jivesearch/jivesearch) A search engine that doesn't track you.
- [Leagify/colly-draft-prospects](https://github.com/Leagify/colly-draft-prospects) A scraper for future NFL Draft prospects.
- [lucasepe/go-ps4](https://github.com/lucasepe/go-ps4) Search playstation store for your favorite PS4 games using the command line.
- [yringler/inside-chassidus-scraper](https://github.com/yringler/inside-chassidus-scraper) Scrapes Rabbi Paltiel's web site for lesson metadata.
- [gamedb/gamedb](https://github.com/gamedb/gamedb) A database of Steam games.
- [lawzava/scrape](https://github.com/lawzava/scrape) CLI for email scraping from any website.
- [eureka101v/WeiboSpiderGo](https://github.com/eureka101v/WeiboSpiderGo) A Sina Weibo (Chinese Twitter) scraper.
- [Go-phie/gophie](https://github.com/Go-phie/gophie) Search, Download and Stream movies from your terminal
- [imthaghost/goclone](https://github.com/imthaghost/goclone) Clone websites to your computer within seconds.
- [superiss/spidy](https://github.com/superiss/spidy) Crawl the web and collect expired domains.
- [docker-slim/docker-slim](https://github.com/docker-slim/docker-slim) Optimize your Docker containers to make them smaller and better.
- [seversky/gachifinder](https://github.com/seversky/gachifinder) An agent for asynchronous scraping, parsing and writing to storage backends (Elasticsearch for now).
- [eval-exec/goodreads](https://github.com/eval-exec/goodreads) crawl all tags and all pages of quotes from goodreads.
If you are using Colly in a project please send a pull request to add it to the list.
## Contributors
This project exists thanks to all the people who contribute. [[Contribute]](CONTRIBUTING.md).
## Backers
Thank you to all our backers! 🙏 [[Become a backer](https://opencollective.com/colly#backer)]
## Sponsors
Support this project by becoming a sponsor. Your logo will show up here with a link to your website. [[Become a sponsor](https://opencollective.com/colly#sponsor)]
## License
[](https://app.fossa.io/projects/git%2Bgithub.com%2Fgocolly%2Fcolly?ref=badge_large)
================================================
FILE: VERSION
================================================
2.1.0
================================================
FILE: _examples/README.md
================================================
# Colly examples
This folder provides easy to understand code snippets on how to get started with colly.
To execute an example, run `go run [example/example.go]` from this folder.
## Demo
```
$ go run rate_limit/rate_limit.go
[000001] 1 [ 1 - request] map["url":"https://httpbin.org/delay/2?n=4"] (60.872µs)
[000002] 1 [ 2 - request] map["url":"https://httpbin.org/delay/2?n=2"] (154.425µs)
[000003] 1 [ 3 - request] map["url":"https://httpbin.org/delay/2?n=0"] (158.374µs)
[000004] 1 [ 5 - request] map["url":"https://httpbin.org/delay/2?n=3"] (426.999µs)
[000005] 1 [ 4 - request] map["url":"https://httpbin.org/delay/2?n=1"] (448.75µs)
[000007] 1 [ 2 - response] map["url":"https://httpbin.org/delay/2?n=2" "status":"OK"] (2.855764394s)
[000008] 1 [ 2 - scraped] map["url":"https://httpbin.org/delay/2?n=2"] (2.855797868s)
[000006] 1 [ 1 - response] map["url":"https://httpbin.org/delay/2?n=4" "status":"OK"] (2.855756753s)
[000009] 1 [ 1 - scraped] map["url":"https://httpbin.org/delay/2?n=4"] (2.855819581s)
[000010] 1 [ 3 - response] map["status":"OK" "url":"https://httpbin.org/delay/2?n=0"] (5.002065299s)
[000011] 1 [ 3 - scraped] map["url":"https://httpbin.org/delay/2?n=0"] (5.002103755s)
[000012] 1 [ 5 - response] map["status":"OK" "url":"https://httpbin.org/delay/2?n=3"] (5.012080614s)
[000013] 1 [ 5 - scraped] map["url":"https://httpbin.org/delay/2?n=3"] (5.012101056s)
[000014] 1 [ 4 - response] map["url":"https://httpbin.org/delay/2?n=1" "status":"OK"] (7.155725591s)
[000015] 1 [ 4 - scraped] map["url":"https://httpbin.org/delay/2?n=1"] (7.155759136s)
```
================================================
FILE: _examples/basic/basic.go
================================================
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
)
// main crawls hackerspaces.org, printing every link it encounters and
// following the ones that stay inside the allowed domains.
func main() {
	// The allow-list restricts the crawl to hackerspaces.org and
	// wiki.hackerspaces.org.
	allowed := colly.AllowedDomains("hackerspaces.org", "wiki.hackerspaces.org")
	collector := colly.NewCollector(allowed)

	// Announce each request right before it is made.
	collector.OnRequest(func(req *colly.Request) {
		fmt.Println("Visiting", req.URL.String())
	})

	// Run for every <a> element that carries an href attribute.
	collector.OnHTML("a[href]", func(anchor *colly.HTMLElement) {
		href := anchor.Attr("href")
		fmt.Printf("Link found: %q -> %s\n", anchor.Text, href)

		// Follow the link using its absolute form. Only links inside the
		// allowed domains are actually visited.
		collector.Visit(anchor.Request.AbsoluteURL(href))
	})

	// Kick off the crawl at the site root.
	collector.Visit("https://hackerspaces.org/")
}
================================================
FILE: _examples/coursera_courses/coursera_courses.go
================================================
package main
import (
"encoding/json"
"log"
"os"
"strings"
"time"
"github.com/gocolly/colly/v2"
)
// Course is the record produced for every scraped Coursera course page.
// The fields are declared in the order in which they are written to the
// JSON output.
type Course struct {
	// Title is the banner title and Description the main content text of
	// the course page.
	Title, Description string

	// Creator is the instructor shown in the banner; Level is the
	// difficulty entry of the course overview.
	Creator, Level string

	// URL is the address of the course page the record was scraped from.
	URL string

	// Language is the "Available languages" entry of the course overview.
	Language string

	// Commitment is the "Hours to complete" entry of the course overview;
	// Rating is the numeric rating shown on the page.
	Commitment, Rating string
}
// main crawls the Coursera catalogue starting at /browse, collects the details
// of every course page it reaches and writes them as indented JSON to
// courses.json in the current directory.
func main() {
	fName := "courses.json"
	file, err := os.Create(fName)
	if err != nil {
		// log.Fatalf terminates the program, so no explicit return is needed.
		log.Fatalf("Cannot create file %q: %s\n", fName, err)
	}
	defer file.Close()

	// Instantiate default collector
	c := colly.NewCollector(
		// Visit only domains: coursera.org, www.coursera.org
		colly.AllowedDomains("coursera.org", "www.coursera.org"),

		// Cache responses to prevent multiple download of pages
		// even if the collector is restarted
		colly.CacheDir("./coursera_cache"),

		// Cached responses older than the specified duration will be refreshed
		colly.CacheExpiration(24*time.Hour),
	)

	// Create another collector to scrape course details
	detailCollector := c.Clone()

	courses := make([]Course, 0, 200)

	// On every element which has "href" attribute call callback
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		// Anchors carrying exactly this class string are irrelevant for
		// the crawl, so they are skipped.
		if e.Attr("class") == "Button_1qxkboh-o_O-primary_cv02ee-o_O-md_28awn8-o_O-primaryLink_109aggg" {
			return
		}
		link := e.Attr("href")
		// Only follow links that start with /browse, and never the signup
		// or login variants of such links.
		if !strings.HasPrefix(link, "/browse") || strings.Contains(link, "=signup") || strings.Contains(link, "=login") {
			return
		}
		// Start scraping the page under the link found. The result is not
		// checked: a link that cannot be visited is simply skipped.
		e.Request.Visit(link)
	})

	// Before making a request print "visiting ..."
	c.OnRequest(func(r *colly.Request) {
		log.Println("visiting", r.URL.String())
	})

	// On every element with collection-product-card class call callback
	c.OnHTML(`a.collection-product-card`, func(e *colly.HTMLElement) {
		// Activate detailCollector if the link contains "coursera.org/learn"
		courseURL := e.Request.AbsoluteURL(e.Attr("href"))
		if strings.Contains(courseURL, "coursera.org/learn") {
			detailCollector.Visit(courseURL)
		}
	})

	// Extract details of the course
	detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) {
		log.Println("Course found", e.Request.URL)
		title := e.ChildText(".banner-title")
		if title == "" {
			// A missing title is only reported; the course is still recorded.
			log.Println("No title found", e.Request.URL)
		}
		course := Course{
			Title:       title,
			URL:         e.Request.URL.String(),
			Description: e.ChildText("div.content"),
			Creator:     e.ChildText("li.banner-instructor-info > a > div > div > span"),
			Rating:      e.ChildText("span.number-rating"),
		}
		// Iterate over div components and add details to course
		e.ForEach(".AboutCourse .ProductGlance > div", func(_ int, el *colly.HTMLElement) {
			// The last word of the svg title identifies the kind of entry.
			// strings.Split with a non-empty separator always returns at
			// least one element, so indexing the last one cannot panic.
			svgTitle := strings.Split(el.ChildText("div:nth-child(1) svg title"), " ")
			lastWord := svgTitle[len(svgTitle)-1]
			switch lastWord {
			case "languages":
				// svg Title: Available Languages
				course.Language = el.ChildText("div:nth-child(2) > div:nth-child(1)")
			case "Level":
				// svg Title: Mixed/Beginner/Intermediate/Advanced Level
				course.Level = el.ChildText("div:nth-child(2) > div:nth-child(1)")
			case "complete":
				// svg Title: Hours to complete
				course.Commitment = el.ChildText("div:nth-child(2) > div:nth-child(1)")
			}
		})
		courses = append(courses, course)
	})

	// Start scraping on https://coursera.org/browse
	if err := c.Visit("https://coursera.org/browse"); err != nil {
		// Keep going so that the output file still contains valid JSON.
		log.Printf("Cannot start scraping: %s\n", err)
	}

	enc := json.NewEncoder(file)
	enc.SetIndent("", " ")

	// Dump the collected courses as JSON into the output file
	if err := enc.Encode(courses); err != nil {
		log.Printf("Cannot write courses to %q: %s\n", fName, err)
	}
}
================================================
FILE: _examples/cryptocoinmarketcap/cryptocoinmarketcap.go
================================================
package main
import (
"encoding/csv"
"log"
"os"
"github.com/gocolly/colly/v2"
)
// main scrapes the coinmarketcap.com overview table and stores one CSV row per
// table row in cryptocoinmarketcap.csv in the current directory.
func main() {
	fName := "cryptocoinmarketcap.csv"
	file, err := os.Create(fName)
	if err != nil {
		// log.Fatalf terminates the program, so no explicit return is needed.
		log.Fatalf("Cannot create file %q: %s\n", fName, err)
	}
	defer file.Close()

	// The CSV writer is buffered: rows only reach the file on Flush, and
	// write failures are reported through writer.Error().
	writer := csv.NewWriter(file)

	// Write CSV header
	writer.Write([]string{"Name", "Symbol", "Market Cap (USD)", "Price (USD)", "Circulating Supply (USD)", "Volume (24h)", "Change (1h)", "Change (24h)", "Change (7d)"})

	// Instantiate default collector
	c := colly.NewCollector()

	// Emit one CSV record for every row of the table body.
	c.OnHTML("tbody tr", func(e *colly.HTMLElement) {
		writer.Write([]string{
			e.ChildText(".cmc-table__column-name"),
			e.ChildText(".cmc-table__cell--sort-by__symbol"),
			e.ChildText(".cmc-table__cell--sort-by__market-cap"),
			e.ChildText(".cmc-table__cell--sort-by__price"),
			e.ChildText(".cmc-table__cell--sort-by__circulating-supply"),
			e.ChildText(".cmc-table__cell--sort-by__volume-24-h"),
			e.ChildText(".cmc-table__cell--sort-by__percent-change-1-h"),
			e.ChildText(".cmc-table__cell--sort-by__percent-change-24-h"),
			e.ChildText(".cmc-table__cell--sort-by__percent-change-7-d"),
		})
	})

	if err := c.Visit("https://coinmarketcap.com/all/views/all/"); err != nil {
		log.Printf("Cannot scrape page: %s\n", err)
	}

	// Flush before reporting success so that a failed write is noticed
	// instead of being announced as a finished scrape.
	writer.Flush()
	if err := writer.Error(); err != nil {
		log.Printf("Cannot write file %q: %s\n", fName, err)
		return
	}

	log.Printf("Scraping finished, check file %q for results\n", fName)
}
================================================
FILE: _examples/error_handling/error_handling.go
================================================
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
)
// main demonstrates the error callback by visiting a URL that is expected
// to fail.
func main() {
	collector := colly.NewCollector()

	// Report the failed request together with its response and the error.
	collector.OnError(func(resp *colly.Response, failure error) {
		fmt.Println("Request URL:", resp.Request.URL, "failed with response:", resp, "\nError:", failure)
	})

	// Print every HTML element. This callback won't be called if an error
	// occurs.
	collector.OnHTML("*", func(el *colly.HTMLElement) {
		fmt.Println(el)
	})

	// Start scraping
	collector.Visit("https://definitely-not-a.website/")
}
================================================
FILE: _examples/factba.se/factbase.go
================================================
package main
import (
"encoding/json"
"fmt"
"os"
"strconv"
"github.com/gocolly/colly/v2"
)
// baseSearchURL is the search endpoint; main appends the page number to it.
var baseSearchURL = "https://factba.se/json/json-transcript.php?q=&f=&dt=&p="
// baseTranscriptURL is the prefix of a transcript page; a result slug is
// appended to it.
var baseTranscriptURL = "https://factba.se/transcript/"
// result is one entry of the search response.
type result struct {
Slug string `json:"slug"`
Date string `json:"date"`
}
// results is the decoded search response; entries are read from its
// "data" key.
type results struct {
Data []*result `json:"data"`
}
// transcript is one speaker/text pair extracted from a transcript page.
type transcript struct {
Speaker string
Text string
}
// main pages through the factba.se search endpoint with collector c and, for
// every result, downloads the transcript page with the cloned collector d.
// Each transcript is written to "<date>_<slug>.json" in the working directory.
func main() {
	c := colly.NewCollector(
		colly.AllowedDomains("factba.se"),
	)
	d := c.Clone()

	// Transcript pages: collect the speaker/text pairs and dump them as JSON.
	d.OnHTML("body", func(e *colly.HTMLElement) {
		t := make([]transcript, 0)
		e.ForEach(".topic-media-row", func(_ int, el *colly.HTMLElement) {
			t = append(t, transcript{
				Speaker: el.ChildText(".speaker-label"),
				Text:    el.ChildText(".transcript-text-block"),
			})
		})
		jsonData, err := json.MarshalIndent(t, "", " ")
		if err != nil {
			fmt.Println("Error:", err)
			return
		}
		// date and slug were stored in the request context by the search
		// callback below.
		fName := colly.SanitizeFileName(e.Request.Ctx.Get("date")+"_"+e.Request.Ctx.Get("slug")) + ".json"
		if err := os.WriteFile(fName, jsonData, 0644); err != nil {
			fmt.Println("Error:", err)
		}
	})

	// stop is set once a search page is empty or cannot be decoded.
	stop := false
	c.OnResponse(func(r *colly.Response) {
		rs := &results{}
		err := json.Unmarshal(r.Body, rs)
		if err != nil || len(rs.Data) == 0 {
			stop = true
			return
		}
		for _, res := range rs.Data {
			u := baseTranscriptURL + res.Slug
			ctx := colly.NewContext()
			ctx.Put("date", res.Date)
			ctx.Put("slug", res.Slug)
			// A failing transcript must not abort the remaining results.
			if err := d.Request("GET", u, nil, ctx, nil); err != nil {
				fmt.Println("Error:", err)
			}
		}
	})

	for i := 1; i < 1000; i++ {
		if stop {
			break
		}
		if err := c.Visit(baseSearchURL + strconv.Itoa(i)); err != nil {
			fmt.Println("Error:", err)
			break
		}
	}
}
================================================
FILE: _examples/google_groups/google_groups.go
================================================
package main
import (
"encoding/json"
"flag"
"log"
"os"
"strings"
"github.com/gocolly/colly/v2"
)
// Mail is the container of a single e-mail.
// All fields are stored as the raw text extracted from the thread page.
type Mail struct {
Title string
Link string
Author string
Date string
Message string
}
// main crawls the thread list of a Google Group (flag -group), follows every
// thread with a second collector and prints the collected mails, grouped by
// thread subject, as JSON on standard output.
func main() {
	var groupName string
	flag.StringVar(&groupName, "group", "hspbp", "Google Groups group name")
	flag.Parse()

	// Mails keyed by thread subject.
	threads := make(map[string][]Mail)

	threadCollector := colly.NewCollector()
	mailCollector := colly.NewCollector()

	// Thread list rows: one row per thread.
	threadCollector.OnHTML("tr", func(row *colly.HTMLElement) {
		cells := row.DOM.Children()
		author := cells.Eq(1).Text()
		if author == "" {
			// deleted topic
			return
		}
		firstCell := cells.Eq(0)
		title := firstCell.Text()
		link, _ := firstCell.Children().Eq(0).Attr("href")
		// rewrite the link so it points to the pure HTML version of the thread
		link = strings.Replace(link, ".com/d/topic", ".com/forum/?_escaped_fragment_=topic", 1)
		date := cells.Eq(2).Text()
		log.Printf("Thread found: %s %q %s %s\n", link, title, author, date)
		mailCollector.Visit(link)
	})

	// Thread list pagination.
	threadCollector.OnHTML("body > a[href]", func(next *colly.HTMLElement) {
		log.Println("Next page link found:", next.Attr("href"))
		next.Request.Visit(next.Attr("href"))
	})

	// Thread pages: extract every mail of the thread.
	mailCollector.OnHTML("body", func(page *colly.HTMLElement) {
		subject := page.ChildText("h2")
		if _, known := threads[subject]; !known {
			threads[subject] = make([]Mail, 0, 8)
		}

		page.ForEach("table tr", func(_ int, row *colly.HTMLElement) {
			threads[subject] = append(threads[subject], Mail{
				Title:   row.ChildText("td:nth-of-type(1)"),
				Link:    row.ChildAttr("td:nth-of-type(1)", "href"),
				Author:  row.ChildText("td:nth-of-type(2)"),
				Date:    row.ChildText("td:nth-of-type(3)"),
				Message: row.ChildText("td:nth-of-type(4)"),
			})
		})

		// Continue with the next page of the thread, if there is one.
		nextLink, hasNext := page.DOM.Find("> a[href]").Attr("href")
		if !hasNext {
			log.Printf("Thread %q done\n", subject)
			return
		}
		page.Request.Visit(nextLink)
	})

	threadCollector.Visit("https://groups.google.com/forum/?_escaped_fragment_=forum/" + groupName)

	// Dump json to the standard output
	encoder := json.NewEncoder(os.Stdout)
	encoder.SetIndent("", " ")
	encoder.Encode(threads)
}
================================================
FILE: _examples/hackernews_comments/hackernews_comments.go
================================================
package main
import (
"encoding/json"
"flag"
"log"
"os"
"strconv"
"strings"
"github.com/gocolly/colly/v2"
)
// comment is a single hackernews comment together with its replies.
// The selector/attr struct tags name the CSS selectors the tagged fields are
// filled from when the element is unmarshaled.
type comment struct {
Author string `selector:"a.hnuser"`
URL string `selector:".age a[href]" attr:"href"`
Comment string `selector:".comment"`
// Replies holds the direct child comments.
Replies []*comment
// depth is the nesting level; it is unexported and therefore not part of
// the JSON output.
depth int
}
// main downloads the comment page of the hackernews post given by the -id
// flag, rebuilds the comment tree from the indentation of each comment row
// and prints it as JSON on standard output.
func main() {
	var itemID string
	flag.StringVar(&itemID, "id", "", "hackernews post id")
	flag.Parse()
	if itemID == "" {
		log.Println("Hackernews post id required")
		os.Exit(1)
	}

	comments := make([]*comment, 0)

	// Instantiate default collector
	c := colly.NewCollector()

	// Extract comment
	c.OnHTML(".comment-tree tr.athing", func(e *colly.HTMLElement) {
		width, err := strconv.Atoi(e.ChildAttr("td.ind img", "width"))
		if err != nil {
			return
		}
		// hackernews uses 40px spacers to indent comment replies,
		// so we have to divide the width with it to get the depth
		// of the comment
		depth := width / 40

		// Named cm so it does not shadow the collector c.
		cm := &comment{
			Replies: make([]*comment, 0),
			depth:   depth,
		}
		// Best effort: a comment that cannot be fully extracted is kept with
		// whatever fields were filled.
		if err := e.Unmarshal(cm); err != nil {
			log.Println("Cannot extract comment:", err)
		}

		// Drop the last five bytes of the extracted text before trimming;
		// shorter texts are left untouched instead of panicking.
		if n := len(cm.Comment); n >= 5 {
			cm.Comment = cm.Comment[:n-5]
		}
		cm.Comment = strings.TrimSpace(cm.Comment)

		if depth == 0 {
			comments = append(comments, cm)
			return
		}
		if len(comments) == 0 {
			// A reply without any preceding top-level comment cannot be
			// attached anywhere.
			return
		}

		// Walk down the most recent branch to find the parent.
		parent := comments[len(comments)-1]
		for i := 0; i < depth-1; i++ {
			if len(parent.Replies) == 0 {
				// An intermediate level is missing; attach to the deepest
				// ancestor that exists.
				break
			}
			parent = parent.Replies[len(parent.Replies)-1]
		}
		parent.Replies = append(parent.Replies, cm)
	})

	c.Visit("https://news.ycombinator.com/item?id=" + itemID)

	enc := json.NewEncoder(os.Stdout)
	enc.SetIndent("", " ")
	// Dump json to the standard output
	if err := enc.Encode(comments); err != nil {
		log.Println("Cannot encode comments:", err)
	}
}
================================================
FILE: _examples/instagram/instagram.go
================================================
package main
import (
"crypto/md5"
"encoding/json"
"fmt"
"log"
"net/url"
"os"
"regexp"
"strings"
"github.com/gocolly/colly/v2"
)
// nextPageURL is the format string of the paging query; its two %s verbs
// are filled with the query hash and the URL-escaped variables payload.
const nextPageURL string = `https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s`
// nextPagePayload is the format string of the query variables.
// "id": user id, "after": end cursor
const nextPagePayload string = `{"id":"%s","first":50,"after":"%s"}`
// requestID is the query hash extracted from the second queryIdPattern match.
var requestID string
// requestIds holds every queryIdPattern match of the last script response.
var requestIds [][]byte
// queryIdPattern matches `queryId:"` followed by 32 characters and a closing
// quote.
var queryIdPattern = regexp.MustCompile(`queryId:".{32}"`)
// pageInfo carries the paging cursor of a media listing.
type pageInfo struct {
	EndCursor string `json:"end_cursor"`
	NextPage  bool   `json:"has_next_page"`
}

// mainPageData mirrors the JSON document embedded in the first script tag of
// a profile page. Only the fields read by main are declared.
type mainPageData struct {
	Rhxgis    string `json:"rhx_gis"`
	EntryData struct {
		ProfilePage []struct {
			Graphql struct {
				User struct {
					Id    string `json:"id"`
					Media struct {
						Edges []struct {
							Node struct {
								ImageURL     string `json:"display_url"`
								ThumbnailURL string `json:"thumbnail_src"`
								IsVideo      bool   `json:"is_video"`
								Date         int    `json:"date"`
								Dimensions   struct {
									Width  int `json:"width"`
									Height int `json:"height"`
								} `json:"dimensions"`
								// The tag previously lacked its opening quote
								// (`json:node"`), which made it unparseable.
							} `json:"node"`
						} `json:"edges"`
						PageInfo pageInfo `json:"page_info"`
					} `json:"edge_owner_to_timeline_media"`
				} `json:"user"`
			} `json:"graphql"`
		} `json:"ProfilePage"`
	} `json:"entry_data"`
}
// nextPageData mirrors the JSON returned by the paging query built from
// nextPageURL. Only the fields read by main are declared.
type nextPageData struct {
Data struct {
// User, Node and Dimensions carry no json tag, so decoding relies on
// encoding/json matching keys to field names case-insensitively.
User struct {
Container struct {
PageInfo pageInfo `json:"page_info"`
Edges []struct {
Node struct {
ImageURL string `json:"display_url"`
ThumbnailURL string `json:"thumbnail_src"`
IsVideo bool `json:"is_video"`
Date int `json:"taken_at_timestamp"`
Dimensions struct {
Width int `json:"width"`
Height int `json:"height"`
}
}
} `json:"edges"`
} `json:"edge_owner_to_timeline_media"`
}
} `json:"data"`
}
// main downloads the images of the instagram account named by the single
// command line argument into ./instagram_<account>/.
//
// The profile page yields the first batch of images plus the data needed for
// paging (user id, end cursor, rhx_gis and the query hash); the remaining
// pages are fetched through the query URL built from nextPageURL.
func main() {
	if len(os.Args) != 2 {
		log.Println("Missing account name argument")
		os.Exit(1)
	}

	var actualUserID string
	instagramAccount := os.Args[1]
	outputDir := fmt.Sprintf("./instagram_%s/", instagramAccount)

	c := colly.NewCollector(
		//colly.CacheDir("./_instagram_cache/"),
		colly.UserAgent("Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"),
	)

	c.OnRequest(func(r *colly.Request) {
		r.Headers.Set("X-Requested-With", "XMLHttpRequest")
		r.Headers.Set("Referer", "https://www.instagram.com/"+instagramAccount)
		// Paging requests carry "gis" in their context and are signed with
		// the md5 of "<gis>:<variables>".
		if r.Ctx.Get("gis") != "" {
			gis := fmt.Sprintf("%s:%s", r.Ctx.Get("gis"), r.Ctx.Get("variables"))
			sum := md5.Sum([]byte(gis))
			r.Headers.Set("X-Instagram-GIS", fmt.Sprintf("%x", sum[:]))
		}
	})

	c.OnHTML("html", func(e *colly.HTMLElement) {
		// Fetch the linked script with a cloned collector to extract the
		// query hash.
		d := c.Clone()
		d.OnResponse(func(r *colly.Response) {
			requestIds = queryIdPattern.FindAll(r.Body, -1)
			// The second match is the one used; bail out instead of
			// panicking when the script does not contain it.
			if len(requestIds) < 2 {
				log.Println("query id not found in", r.Request.URL)
				return
			}
			// Skip the 9 byte `queryId:"` prefix and take the 32 byte hash.
			requestID = string(requestIds[1][9:41])
		})
		requestIDURL := e.Request.AbsoluteURL(e.ChildAttr(`link[as="script"]`, "href"))
		d.Visit(requestIDURL)

		// The profile data is the JSON object inside the first script tag;
		// the last character of the script text is cut off.
		dat := e.ChildText("body > script:first-of-type")
		start := strings.Index(dat, "{")
		if start < 0 {
			log.Println("profile data not found in", e.Request.URL)
			return
		}
		jsonData := dat[start : len(dat)-1]

		data := &mainPageData{}
		if err := json.Unmarshal([]byte(jsonData), data); err != nil {
			log.Fatal(err)
		}

		log.Println("saving output to ", outputDir)
		if err := os.MkdirAll(outputDir, os.ModePerm); err != nil {
			log.Fatal(err)
		}

		if len(data.EntryData.ProfilePage) == 0 {
			log.Println("no profile page data found for", instagramAccount)
			return
		}
		page := data.EntryData.ProfilePage[0]
		actualUserID = page.Graphql.User.Id
		for _, obj := range page.Graphql.User.Media.Edges {
			// skip videos
			if obj.Node.IsVideo {
				continue
			}
			c.Visit(obj.Node.ImageURL)
		}

		nextPageVars := fmt.Sprintf(nextPagePayload, actualUserID, page.Graphql.User.Media.PageInfo.EndCursor)
		e.Request.Ctx.Put("variables", nextPageVars)
		if page.Graphql.User.Media.PageInfo.NextPage {
			u := fmt.Sprintf(
				nextPageURL,
				requestID,
				url.QueryEscape(nextPageVars),
			)
			log.Println("Next page found", u)
			e.Request.Ctx.Put("gis", data.Rhxgis)
			e.Request.Visit(u)
		}
	})

	c.OnError(func(r *colly.Response, e error) {
		log.Println("error:", e, r.Request.URL, string(r.Body))
	})

	c.OnResponse(func(r *colly.Response) {
		contentType := r.Headers.Get("Content-Type")
		// Images are stored; only JSON responses are paging results.
		if strings.Contains(contentType, "image") {
			if err := r.Save(outputDir + r.FileName()); err != nil {
				log.Println("error:", err, r.Request.URL)
			}
			return
		}
		if !strings.Contains(contentType, "json") {
			return
		}

		data := &nextPageData{}
		if err := json.Unmarshal(r.Body, data); err != nil {
			log.Fatal(err)
		}

		for _, obj := range data.Data.User.Container.Edges {
			// skip videos
			if obj.Node.IsVideo {
				continue
			}
			c.Visit(obj.Node.ImageURL)
		}
		if data.Data.User.Container.PageInfo.NextPage {
			nextPageVars := fmt.Sprintf(nextPagePayload, actualUserID, data.Data.User.Container.PageInfo.EndCursor)
			r.Request.Ctx.Put("variables", nextPageVars)
			u := fmt.Sprintf(
				nextPageURL,
				requestID,
				url.QueryEscape(nextPageVars),
			)
			log.Println("Next page found", u)
			r.Request.Visit(u)
		}
	})

	c.Visit("https://instagram.com/" + instagramAccount)
}
================================================
FILE: _examples/local_files/html/child_page/one.html
================================================
Child Page One
================================================
FILE: _examples/local_files/html/child_page/three.html
================================================
Child Page Three
================================================
FILE: _examples/local_files/html/child_page/two.html
================================================
Child Page Two
================================================
FILE: _examples/local_files/html/index.html
================================================
Index.html
================================================
FILE: _examples/local_files/local_files.go
================================================
package main
import (
"fmt"
"net/http"
"os"
"path/filepath"
"github.com/gocolly/colly/v2"
)
// main crawls the html directory next to the executable through a
// "file://" transport and prints the text of every h1 element found.
func main() {
	exeDir, err := filepath.Abs(filepath.Dir(os.Args[0]))
	if err != nil {
		panic(err)
	}
	htmlRoot := "file://" + exeDir + "/html"
	indexURL := htmlRoot + "/index.html"

	// Serve file:// URLs from the root of the local file system.
	transport := &http.Transport{}
	transport.RegisterProtocol("file", http.NewFileTransport(http.Dir("/")))

	c := colly.NewCollector()
	c.WithTransport(transport)

	pages := []string{}

	// Remember every heading.
	c.OnHTML("h1", func(heading *colly.HTMLElement) {
		pages = append(pages, heading.Text)
	})

	// Follow every link, resolved against the html directory.
	c.OnHTML("a", func(anchor *colly.HTMLElement) {
		c.Visit(htmlRoot + anchor.Attr("href"))
	})

	fmt.Println(indexURL)
	c.Visit(indexURL)
	c.Wait()

	for idx := range pages {
		fmt.Printf("%d : %s\n", idx, pages[idx])
	}
}
================================================
FILE: _examples/login/login.go
================================================
package main
import (
"log"
"github.com/gocolly/colly/v2"
)
// main posts login credentials first and only afterwards registers the
// response callback and starts scraping.
func main() {
	collector := colly.NewCollector()

	// Authenticate; without a successful login there is nothing to scrape.
	credentials := map[string]string{"username": "admin", "password": "admin"}
	if err := collector.Post("http://example.com/login", credentials); err != nil {
		log.Fatal(err)
	}

	// Callbacks are attached after the login request was made.
	collector.OnResponse(func(resp *colly.Response) {
		log.Println("response received", resp.StatusCode)
	})

	// Start scraping.
	collector.Visit("https://example.com/")
}
================================================
FILE: _examples/max_depth/max_depth.go
================================================
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
)
// main prints and follows the links of the start page. With MaxDepth(1) only
// the links found on the scraped page are visited; links on those pages are
// not followed any further.
func main() {
	collector := colly.NewCollector(colly.MaxDepth(1))

	// Called for every anchor element that has an href attribute.
	followLink := func(anchor *colly.HTMLElement) {
		href := anchor.Attr("href")
		fmt.Println(href)
		anchor.Request.Visit(href)
	}
	collector.OnHTML("a[href]", followLink)

	// Start scraping on https://en.wikipedia.org
	collector.Visit("https://en.wikipedia.org/")
}
================================================
FILE: _examples/multipart/multipart.go
================================================
package main
import (
"fmt"
"io"
"net/http"
"os"
"time"
"github.com/gocolly/colly/v2"
)
// generateFormData builds the multipart form fields that are posted to the
// test server: three text fields and the content of gocolly.jpg read from the
// working directory.
//
// When the image cannot be opened or read, the problem is printed and the
// "file" entry carries whatever was read (possibly nothing), so the example
// still runs.
func generateFormData() map[string][]byte {
	var imgData []byte
	f, err := os.Open("gocolly.jpg")
	if err != nil {
		fmt.Println("Cannot open gocolly.jpg:", err)
	} else {
		// Close is only deferred once the file is known to be open.
		defer f.Close()
		if imgData, err = io.ReadAll(f); err != nil {
			fmt.Println("Cannot read gocolly.jpg:", err)
		}
	}

	return map[string][]byte{
		"firstname": []byte("one"),
		"lastname":  []byte("two"),
		"email":     []byte("onetwo@example.com"),
		"file":      imgData,
	}
}
// setupServer starts, in a background goroutine, an HTTP server on :8080
// whose only handler parses the request as a multipart form and answers
// "Success" (200) or "Internal Server Error" (500).
func setupServer() {
	handle := func(w http.ResponseWriter, r *http.Request) {
		fmt.Println("received request")
		// Up to 10000000 bytes of the form are kept in memory.
		if err := r.ParseMultipartForm(10000000); err != nil {
			fmt.Println("server: Error")
			w.WriteHeader(http.StatusInternalServerError)
			w.Write([]byte("Internal Server Error"))
			return
		}
		w.WriteHeader(http.StatusOK)
		fmt.Println("server: OK")
		w.Write([]byte("Success"))
	}
	go http.ListenAndServe(":8080", http.HandlerFunc(handle))
}
// main starts the local test server and posts the generated multipart form
// to it. Every HTML response triggers another post one second later, up to
// the collector's maximum depth of 5.
func main() {
	const endpoint = "http://localhost:8080/"

	// Start a single route http server to post an image to.
	setupServer()

	collector := colly.NewCollector(colly.AllowURLRevisit(), colly.MaxDepth(5))

	// For every html document received: print its text, wait a second and
	// post the form again.
	collector.OnHTML("html", func(doc *colly.HTMLElement) {
		fmt.Println(doc.Text)
		time.Sleep(1 * time.Second)
		doc.Request.PostMultipart(endpoint, generateFormData())
	})

	// Announce every request before it is made.
	collector.OnRequest(func(req *colly.Request) {
		fmt.Println("Posting gocolly.jpg to", req.URL.String())
	})

	// Start scraping
	collector.PostMultipart(endpoint, generateFormData())
	collector.Wait()
}
================================================
FILE: _examples/openedx_courses/openedx_courses.go
================================================
package main
import (
"encoding/json"
"fmt"
"strings"
"time"
"github.com/gocolly/colly/v2"
)
// DATE_FORMAT is the default date layout used by openedx, written as a
// time.Parse reference layout.
const DATE_FORMAT = "02 Jan, 2006"
// Course stores the data of a single openedx course.
type Course struct {
CourseID string
// Run is the part of CourseID after the first "_", if any.
Run string
Name string
Number string
StartDate *time.Time
EndDate *time.Time
// URL is the path of the course's about page.
URL string
}
// main crawls the course catalogue of an openedx site (IndonesiaX is used as
// sample), extracts the details of every course page and prints the collected
// courses as JSON on standard output.
func main() {
	// Instantiate default collector
	c := colly.NewCollector(
		// Using IndonesiaX as sample
		colly.AllowedDomains("indonesiax.co.id", "www.indonesiax.co.id"),
		// Cache responses to prevent multiple download of pages
		// even if the collector is restarted
		colly.CacheDir("./cache"),
	)

	courses := make([]Course, 0, 200)

	// Follow every link that points to a course page.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		if !strings.HasPrefix(link, "/courses/") {
			return
		}
		// start scraping the page under the link found
		e.Request.Visit(link)
	})

	// Extract the course details from course pages.
	c.OnHTML("div[class=main-container]", func(e *colly.HTMLElement) {
		if e.DOM.Find("section#course-info").Length() == 0 {
			return
		}

		title := strings.Split(e.ChildText(".course-info__title"), "\n")[0]
		courseID := e.ChildAttr("input[name=course_id]", "value")
		texts := e.ChildTexts("span[data-datetime]")

		// parseDate returns the i-th date of the page, or nil when the page
		// has fewer date spans (indexing blindly would panic). A text that
		// does not match DATE_FORMAT yields the zero time.
		parseDate := func(i int) *time.Time {
			if i >= len(texts) {
				return nil
			}
			d, _ := time.Parse(DATE_FORMAT, texts[i])
			return &d
		}

		// The run is the second "_"-separated part of the course id.
		var run string
		if parts := strings.Split(courseID, "_"); len(parts) > 1 {
			run = parts[1]
		}

		courses = append(courses, Course{
			CourseID:  courseID,
			Run:       run,
			Name:      title,
			Number:    e.ChildText("span.course-number"),
			StartDate: parseDate(0),
			EndDate:   parseDate(1),
			URL:       fmt.Sprintf("/courses/%s/about", courseID),
		})
	})

	// Start scraping on https://openedxdomain/courses
	c.Visit("https://www.indonesiax.co.id/courses")

	// Convert results to JSON data if the scraping job has finished
	jsonData, err := json.MarshalIndent(courses, "", " ")
	if err != nil {
		panic(err)
	}

	// Dump json to the standard output (can be redirected to a file)
	fmt.Println(string(jsonData))
}
================================================
FILE: _examples/parallel/parallel.go
================================================
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
)
// main crawls wikipedia asynchronously: links on the start page and links on
// those pages are visited (MaxDepth 2), with at most two requests in flight.
func main() {
	collector := colly.NewCollector(
		colly.MaxDepth(2),
		colly.Async(),
	)

	// Cap the parallelism at 2 for every domain. Because requests are
	// spawned dynamically, the limit rule is what bounds the number of
	// simultaneous requests; a fixed number of goroutines would be the
	// alternative.
	everyDomain := &colly.LimitRule{DomainGlob: "*", Parallelism: 2}
	collector.Limit(everyDomain)

	// Print and follow every anchor that has an href attribute; the visit
	// is carried out asynchronously.
	collector.OnHTML("a[href]", func(anchor *colly.HTMLElement) {
		href := anchor.Attr("href")
		fmt.Println(href)
		anchor.Request.Visit(href)
	})

	// Start scraping on https://en.wikipedia.org
	collector.Visit("https://en.wikipedia.org/")

	// Block until every outstanding request has finished.
	collector.Wait()
}
================================================
FILE: _examples/proxy_switcher/proxy_switcher.go
================================================
package main
import (
"bytes"
"log"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/proxy"
)
// main fetches httpbin.org/ip five times through two socks5 proxies that are
// used in round-robin order and logs the proxy and the body of each response.
func main() {
	collector := colly.NewCollector(colly.AllowURLRevisit())

	// Rotate two socks5 proxies
	switcher, err := proxy.RoundRobinProxySwitcher("socks5://127.0.0.1:1337", "socks5://127.0.0.1:1338")
	if err != nil {
		log.Fatal(err)
	}
	collector.SetProxyFunc(switcher)

	// Log the proxy used and the response body with its newlines removed.
	collector.OnResponse(func(resp *colly.Response) {
		log.Printf("Proxy Address: %s\n", resp.Request.ProxyURL)
		oneLine := bytes.ReplaceAll(resp.Body, []byte("\n"), nil)
		log.Printf("%s\n", oneLine)
	})

	// Fetch httpbin.org/ip five times
	for remaining := 5; remaining > 0; remaining-- {
		collector.Visit("https://httpbin.org/ip")
	}
}
================================================
FILE: _examples/queue/queue.go
================================================
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/queue"
)
// main feeds a request queue with five URLs and lets two consumer threads
// work through it. Every request whose ID is below 15 enqueues one follow-up
// request.
func main() {
	baseURL := "https://httpbin.org/delay/1"

	// Instantiate default collector
	c := colly.NewCollector(colly.AllowURLRevisit())

	// Request queue with 2 consumer threads backed by in-memory storage.
	storage := &queue.InMemoryQueueStorage{MaxSize: 10000}
	q, _ := queue.New(2, storage)

	c.OnRequest(func(req *colly.Request) {
		fmt.Println("visiting", req.URL)
		if req.ID >= 15 {
			return
		}
		followUp, err := req.New("GET", fmt.Sprintf("%s?x=%v", baseURL, req.ID), nil)
		if err != nil {
			return
		}
		q.AddRequest(followUp)
	})

	// Seed the queue.
	for n := 0; n < 5; n++ {
		q.AddURL(fmt.Sprintf("%s?n=%d", baseURL, n))
	}

	// Consume URLs
	q.Run(c)
}
================================================
FILE: _examples/random_delay/random_delay.go
================================================
package main
import (
"fmt"
"time"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/debug"
)
// main issues five asynchronous requests against httpbin.org/delay/2 while a
// limit rule restricts matching domains to two parallel requests with a
// random delay of up to five seconds. A log debugger is attached to the
// collector.
func main() {
	target := "https://httpbin.org/delay/2"

	collector := colly.NewCollector(
		// Attach a debugger to the collector
		colly.Debugger(&debug.LogDebugger{}),
		colly.Async(),
	)

	// Applies to every domain matching the "*httpbin.*" glob.
	rule := &colly.LimitRule{
		DomainGlob:  "*httpbin.*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	}
	collector.Limit(rule)

	// Four numbered requests ...
	for n := 0; n < 4; n++ {
		collector.Visit(fmt.Sprintf("%s?n=%d", target, n))
	}
	// ... plus one for the plain URL.
	collector.Visit(target)

	// Wait until threads are finished
	collector.Wait()
}
================================================
FILE: _examples/rate_limit/rate_limit.go
================================================
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
"github.com/gocolly/colly/v2/debug"
)
// main issues five asynchronous requests against httpbin.org/delay/2 while a
// limit rule restricts matching domains to two parallel requests. A log
// debugger is attached to the collector.
func main() {
	target := "https://httpbin.org/delay/2"

	collector := colly.NewCollector(
		// Turn on asynchronous requests
		colly.Async(),
		// Attach a debugger to the collector
		colly.Debugger(&debug.LogDebugger{}),
	)

	// Applies to every domain matching the "*httpbin.*" glob.
	rule := &colly.LimitRule{
		DomainGlob:  "*httpbin.*",
		Parallelism: 2,
		//Delay: 5 * time.Second,
	}
	collector.Limit(rule)

	// Five numbered requests.
	for n := 0; n < 5; n++ {
		collector.Visit(fmt.Sprintf("%s?n=%d", target, n))
	}

	// Wait until threads are finished
	collector.Wait()
}
================================================
FILE: _examples/reddit/reddit.go
================================================
package main
import (
"fmt"
"os"
"time"
"github.com/gocolly/colly/v2"
)
// item holds the data scraped for a single reddit story.
type item struct {
// StoryURL is the href of the story's title link.
StoryURL string
// Source is the URL the story is attributed to.
Source string
// NOTE(review): comments (lower case) is not referenced anywhere in this
// file; only Comments below is filled. It looks like a leftover - confirm
// before removing, since it is part of the printed struct value.
comments string
// CrawledAt is the time the story was scraped.
CrawledAt time.Time
// Comments is the href of the story's comments link.
Comments string
// Title is the text of the story's title link.
Title string
}
// main crawls the old.reddit.com listing pages passed as command line
// arguments, follows their "next" buttons and prints the collected stories.
func main() {
	stories := []item{}

	// Instantiate default collector
	c := colly.NewCollector(
		// Visit only domains: old.reddit.com
		colly.AllowedDomains("old.reddit.com"),
		// Parallelism
		colly.Async(true),
	)

	// In async mode callbacks run concurrently, so appending to stories
	// from them directly would be a data race. Scraped items are sent over a
	// channel to a single goroutine that owns the slice until the crawl is
	// finished.
	found := make(chan item)
	collected := make(chan struct{})
	go func() {
		defer close(collected)
		for story := range found {
			stories = append(stories, story)
		}
	}()

	// On every element with the .top-matter class call callback.
	// This class is unique to the div that holds all information about a story
	c.OnHTML(".top-matter", func(e *colly.HTMLElement) {
		found <- item{
			StoryURL:  e.ChildAttr("a[data-event-action=title]", "href"),
			Source:    "https://old.reddit.com/r/programming/",
			Title:     e.ChildText("a[data-event-action=title]"),
			Comments:  e.ChildAttr("a[data-event-action=comments]", "href"),
			CrawledAt: time.Now(),
		}
	})

	// On every span tag with the class next-button
	c.OnHTML("span.next-button", func(h *colly.HTMLElement) {
		c.Visit(h.ChildAttr("a", "href"))
	})

	// Set max Parallelism and introduce a Random Delay.
	// A limit rule needs a domain pattern: without DomainGlob or
	// DomainRegexp it matches no request and Limit reports an error.
	err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	})
	if err != nil {
		fmt.Println("Cannot set limit rule:", err)
	}

	// Before making a request print "Visiting ..."
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Crawl all reddits the user passes in
	for _, reddit := range os.Args[1:] {
		c.Visit(reddit)
	}

	// After Wait returns no callback is running anymore, so the channel can
	// be closed; then wait for the collecting goroutine to drain it.
	c.Wait()
	close(found)
	<-collected

	fmt.Println(stories)
}
================================================
FILE: _examples/request_context/request_context.go
================================================
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
)
// main shows how data travels from a request to its response through the
// request context: the URL is stored under the key "url" before the request
// is made and printed from the context once the response arrives.
func main() {
	const contextKey = "url"

	collector := colly.NewCollector()

	// Before the request: remember its URL in the context.
	collector.OnRequest(func(req *colly.Request) {
		req.Ctx.Put(contextKey, req.URL.String())
	})

	// After the request: read the URL back from the context.
	collector.OnResponse(func(resp *colly.Response) {
		fmt.Println(resp.Ctx.Get(contextKey))
	})

	// Start scraping on https://en.wikipedia.org
	collector.Visit("https://en.wikipedia.org/")
}
================================================
FILE: _examples/scraper_server/scraper_server.go
================================================
package main
import (
"encoding/json"
"log"
"net/http"
"github.com/gocolly/colly/v2"
)
// pageInfo is the JSON answer of the scraper server for one visited page.
type pageInfo struct {
// StatusCode is the status code of the response (or of the failed response).
StatusCode int
// Links counts how often each absolute link URL occurs on the page.
Links map[string]int
}
// handler scrapes the page given by the "url" query parameter and answers
// with a JSON encoded pageInfo containing the status code of the page and
// the number of occurrences of every link on it.
//
// A missing "url" parameter is answered with 400 Bad Request; a response
// that cannot be serialized is answered with 500 Internal Server Error.
func handler(w http.ResponseWriter, r *http.Request) {
	URL := r.URL.Query().Get("url")
	if URL == "" {
		log.Println("missing URL argument")
		http.Error(w, "missing URL argument", http.StatusBadRequest)
		return
	}
	log.Println("visiting", URL)

	c := colly.NewCollector()
	p := &pageInfo{Links: make(map[string]int)}

	// count links
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Request.AbsoluteURL(e.Attr("href"))
		if link != "" {
			p.Links[link]++
		}
	})

	// extract status code
	c.OnResponse(func(resp *colly.Response) {
		log.Println("response received", resp.StatusCode)
		p.StatusCode = resp.StatusCode
	})
	c.OnError(func(resp *colly.Response, err error) {
		log.Println("error:", resp.StatusCode, err)
		p.StatusCode = resp.StatusCode
	})

	// A failed visit is still answered with whatever was collected.
	if err := c.Visit(URL); err != nil {
		log.Println("visit failed:", err)
	}

	// dump results
	b, err := json.Marshal(p)
	if err != nil {
		log.Println("failed to serialize response:", err)
		http.Error(w, "failed to serialize response", http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	if _, err := w.Write(b); err != nil {
		log.Println("failed to write response:", err)
	}
}
// main serves the scraping handler on port 7171 until the server fails.
//
// example usage: curl -s 'http://127.0.0.1:7171/?url=http://go-colly.org/'
func main() {
	const addr = ":7171"

	http.HandleFunc("/", handler)

	log.Println("listening on", addr)
	// ListenAndServe only returns on error, which is fatal.
	serveErr := http.ListenAndServe(addr, nil)
	log.Fatal(serveErr)
}
================================================
FILE: _examples/shopify_sitemap/shopify_sitemap.go
================================================
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
)
// main downloads the Shopify sitemap, collects the text of every
// //urlset/url/loc node and prints the URLs followed by their count.
func main() {
	// All URLs found in the sitemap, in the order they were encountered.
	sitemapURLs := []string{}

	// Collector restricted to www.shopify.com
	collector := colly.NewCollector(colly.AllowedDomains("www.shopify.com"))

	// XPath callback gathering the URL locations.
	collector.OnXML("//urlset/url/loc", func(loc *colly.XMLElement) {
		sitemapURLs = append(sitemapURLs, loc.Text)
	})

	// Start the collector
	collector.Visit("https://www.shopify.com/sitemap.xml")

	fmt.Println("All known URLs:")
	for i := range sitemapURLs {
		fmt.Println("\t", sitemapURLs[i])
	}
	fmt.Println("Collected", len(sitemapURLs), "URLs")
}
================================================
FILE: _examples/url_filter/url_filter.go
================================================
package main
import (
"fmt"
"regexp"
"github.com/gocolly/colly/v2"
)
// main crawls httpbin.org but only follows URLs accepted by one of the
// collector's URL filters: the root URL and paths starting with "e" or "h".
func main() {
	allowed := []*regexp.Regexp{
		regexp.MustCompile("http://httpbin\\.org/(|e.+)$"),
		regexp.MustCompile("http://httpbin\\.org/h.+"),
	}

	collector := colly.NewCollector(colly.URLFilters(allowed...))

	// Announce every request before it is made.
	collector.OnRequest(func(req *colly.Request) {
		fmt.Println("Visiting", req.URL.String())
	})

	// For every anchor with an href attribute: print it and try to visit
	// it. Only links matched by any of the URL filter regexps are actually
	// visited.
	collector.OnHTML("a[href]", func(anchor *colly.HTMLElement) {
		href := anchor.Attr("href")
		fmt.Printf("Link found: %q -> %s\n", anchor.Text, href)
		collector.Visit(anchor.Request.AbsoluteURL(href))
	})

	// Start scraping on http://httpbin.org
	collector.Visit("http://httpbin.org/")
}
================================================
FILE: _examples/xkcd_store/xkcd_store.go
================================================
package main
import (
"encoding/csv"
"log"
"os"
"github.com/gocolly/colly/v2"
)
// main crawls the xkcd store, follows the pagination links and stores name,
// price, URL and image URL of every product in xkcd_store_items.csv.
func main() {
	fName := "xkcd_store_items.csv"
	file, err := os.Create(fName)
	if err != nil {
		// log.Fatalf terminates the process, so nothing may follow it.
		log.Fatalf("Cannot create file %q: %s\n", fName, err)
	}
	defer file.Close()

	writer := csv.NewWriter(file)
	// Flush the buffered rows before the file is closed and report any
	// error that occurred during an earlier Write or the Flush itself.
	defer func() {
		writer.Flush()
		if err := writer.Error(); err != nil {
			log.Printf("Cannot write to file %q: %s\n", fName, err)
		}
	}()

	// Write CSV header
	writer.Write([]string{"Name", "Price", "URL", "Image URL"})

	// Instantiate default collector
	c := colly.NewCollector(
		// Allow requests only to store.xkcd.com
		colly.AllowedDomains("store.xkcd.com"),
	)

	// Extract product details
	c.OnHTML(".product-grid-item", func(e *colly.HTMLElement) {
		writer.Write([]string{
			e.ChildAttr("a", "title"),
			e.ChildText("span"),
			e.Request.AbsoluteURL(e.ChildAttr("a", "href")),
			"https:" + e.ChildAttr("img", "src"),
		})
	})

	// Find and visit next page links
	c.OnHTML(`.next a[href]`, func(e *colly.HTMLElement) {
		e.Request.Visit(e.Attr("href"))
	})

	// Not fatal on purpose: the deferred flush/close must still run.
	if err := c.Visit("https://store.xkcd.com/collections/everything"); err != nil {
		log.Printf("Cannot visit page: %s\n", err)
	}

	log.Printf("Scraping finished, check file %q for results\n", fName)

	// Display collector's statistics
	log.Println(c)
}
================================================
FILE: cmd/colly/colly.go
================================================
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"bytes"
"fmt"
"log"
"os"
"strings"
"github.com/jawher/mow.cli"
)
// scraperHeadTemplate is the beginning of every generated scraper: package
// clause, imports and the start of main with the collector c.
var scraperHeadTemplate = `package main
import (
"log"
"github.com/gocolly/colly/v2"
)
func main() {
c := colly.NewCollector()
`
// scraperEndTemplate closes the generated main after starting the scrape.
var scraperEndTemplate = `
c.Visit("https://yourdomain.com/")
}
`
// htmlCallbackTemplate is emitted for the "html" callback.
var htmlCallbackTemplate = `
c.OnHTML("element-selector", func(e *colly.HTMLElement) {
log.Println(e.Text)
})
`
// requestCallbackTemplate is emitted for the "request" callback.
var requestCallbackTemplate = `
c.OnRequest(func(r *colly.Request) {
log.Println("Visiting", r.URL)
})
`
// responseCallbackTemplate is emitted for the "response" callback.
var responseCallbackTemplate = `
c.OnResponse(func(r *colly.Response) {
log.Println("Visited", r.Request.URL, r.StatusCode)
})
`
// errorCallbackTemplate is emitted for the "error" callback.
var errorCallbackTemplate = `
c.OnError(func(r *colly.Response, err error) {
log.Printf("Error on %s: %s", r.Request.URL, err)
})
`
// main implements the colly command line tool. Its only command, "new",
// assembles a scraper skeleton from the templates above and writes it to
// PATH, or to standard output when PATH is omitted.
func main() {
	app := cli.App("colly", "Scraping Framework for Gophers")

	app.Command("new", "Create new scraper", func(cmd *cli.Cmd) {
		var (
			callbacks = cmd.StringOpt("callbacks", "", "Add callbacks to the template. (E.g. '--callbacks=html,response,error')")
			hosts     = cmd.StringOpt("hosts", "", "Specify scraper's allowed hosts. (e.g. '--hosts=xy.com,abcd.com')")
			path      = cmd.StringArg("PATH", "", "Path of the new scraper")
		)

		cmd.Spec = "[--callbacks] [--hosts] [PATH]"

		cmd.Action = func() {
			scraper := bytes.NewBufferString(scraperHeadTemplate)
			outfile := os.Stdout
			if *path != "" {
				var err error
				outfile, err = os.Create(*path)
				if err != nil {
					log.Fatal(err)
				}
				defer outfile.Close()
			}

			// Emit the allowed hosts as a quoted, comma separated list.
			if *hosts != "" {
				scraper.WriteString("\n c.AllowedDomains = []string{")
				for i, h := range strings.Split(*hosts, ",") {
					if i > 0 {
						scraper.WriteString(", ")
					}
					fmt.Fprintf(scraper, "%q", h)
				}
				scraper.WriteString("}\n")
			}

			// Emit one template per requested callback, in the given order.
			if len(*callbacks) > 0 {
				for _, c := range strings.Split(*callbacks, ",") {
					switch c {
					case "html":
						scraper.WriteString(htmlCallbackTemplate)
					case "request":
						scraper.WriteString(requestCallbackTemplate)
					case "response":
						scraper.WriteString(responseCallbackTemplate)
					case "error":
						scraper.WriteString(errorCallbackTemplate)
					default:
						// Still skipped, but no longer silently.
						log.Printf("Ignoring unknown callback %q", c)
					}
				}
			}

			scraper.WriteString(scraperEndTemplate)
			if _, err := outfile.Write(scraper.Bytes()); err != nil {
				log.Fatal(err)
			}
		}
	})

	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}
================================================
FILE: colly.go
================================================
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package colly implements a HTTP scraping framework
package colly
import (
"bytes"
"context"
"crypto/rand"
"encoding/json"
"errors"
"fmt"
"hash/fnv"
"io"
"log"
"net/http"
"net/http/cookiejar"
"net/url"
"os"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/antchfx/htmlquery"
"github.com/antchfx/xmlquery"
"github.com/gocolly/colly/v2/debug"
"github.com/gocolly/colly/v2/storage"
"github.com/kennygrant/sanitize"
whatwgUrl "github.com/nlnwa/whatwg-url/url"
"github.com/temoto/robotstxt"
"google.golang.org/appengine/urlfetch"
)
// A CollectorOption sets an option on a Collector.
// It is a function that receives the *Collector it should configure.
type CollectorOption func(*Collector)
// Collector provides the scraper instance for a scraping job.
// Use NewCollector (or Init on an existing value) to get a usable instance:
// the unexported fields below are set up by Init.
type Collector struct {
// UserAgent is the User-Agent string used by HTTP requests
UserAgent string
// Custom headers for the request.
// They are copied into a request only when no explicit header set is
// supplied for that request.
Headers *http.Header
// MaxDepth limits the recursion depth of visited URLs.
// Set it to 0 for infinite recursion (default).
MaxDepth int
// AllowedDomains is a domain whitelist.
// Leave it blank to allow any domains to be visited
AllowedDomains []string
// DisallowedDomains is a domain blacklist.
// It is checked before AllowedDomains.
DisallowedDomains []string
// DisallowedURLFilters is a list of regular expressions which restricts
// visiting URLs. If any of the rules matches to a URL the
// request will be stopped. DisallowedURLFilters will
// be evaluated before URLFilters
// Leave it blank to allow any URLs to be visited
DisallowedURLFilters []*regexp.Regexp
// URLFilters is a list of regular expressions which restricts
// visiting URLs. If any of the rules matches to a URL the
// request won't be stopped. DisallowedURLFilters will
// be evaluated before URLFilters
// Leave it blank to allow any URLs to be visited
URLFilters []*regexp.Regexp
// AllowURLRevisit allows multiple downloads of the same URL
AllowURLRevisit bool
// MaxBodySize is the limit of the retrieved response body in bytes.
// 0 means unlimited.
// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
MaxBodySize int
// CacheDir specifies a location where GET requests are cached as files.
// When it's not defined, caching is disabled.
CacheDir string
// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
// the target host's robots.txt file. See http://www.robotstxt.org/ for more
// information. Init sets it to true.
IgnoreRobotsTxt bool
// Async turns on asynchronous network communication. Use Collector.Wait() to
// be sure all requests have been finished.
Async bool
// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
// By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
// to true to enable it.
ParseHTTPErrorResponse bool
// ID is the unique identifier of a collector
ID uint32
// DetectCharset can enable character encoding detection for non-utf8 response bodies
// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
DetectCharset bool
// redirectHandler allows control on how a redirect will be managed.
// Use c.SetRedirectHandler to set this value.
redirectHandler func(req *http.Request, via []*http.Request) error
// CheckHead performs a HEAD request before every GET to pre-validate the response
CheckHead bool
// TraceHTTP enables capturing and reporting request performance for crawler tuning.
// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
TraceHTTP bool
// Context is the context that will be used for HTTP requests. You can set this
// to support clean cancellation of scraping.
Context context.Context
// MaxRequests limit the number of requests done by the instance.
// Set it to 0 for infinite requests (default).
MaxRequests uint32
// store keeps the visited-request hashes; Init installs an in-memory store.
store storage.Storage
// debugger, when non-nil, receives an event for every callback stage.
debugger debug.Debugger
// robotsMap caches parsed robots.txt data per URL host; guarded by lock.
robotsMap map[string]*robotstxt.RobotsData
// Registered callbacks, appended to by the corresponding On* methods
// while holding lock.
htmlCallbacks []*htmlCallbackContainer
xmlCallbacks []*xmlCallbackContainer
requestCallbacks []RequestCallback
responseCallbacks []ResponseCallback
responseHeadersCallbacks []ResponseHeadersCallback
requestHeadersCallbacks []RequestCallback
errorCallbacks []ErrorCallback
scrapedCallbacks []ScrapedCallback
// requestCount is incremented for every Request created and provides the
// request IDs; it is also compared against MaxRequests.
requestCount atomic.Uint32
// responseCount is incremented for every response that passed error handling.
responseCount atomic.Uint32
// backend wraps the http.Client that performs the requests.
backend *httpBackend
// wg tracks in-flight fetches; Wait blocks on it.
wg *sync.WaitGroup
// lock guards robotsMap and the callback slices.
lock *sync.RWMutex
// CacheExpiration sets the maximum age for cache files.
// If a cached file is older than this duration, it will be ignored and refreshed.
CacheExpiration time.Duration
}
// RequestCallback is the function type of OnRequest and OnRequestHeaders callbacks
type RequestCallback func(*Request)
// ResponseHeadersCallback is the function type of OnResponseHeaders callbacks
type ResponseHeadersCallback func(*Response)
// ResponseCallback is the function type of OnResponse callbacks
type ResponseCallback func(*Response)
// HTMLCallback is the function type of OnHTML callbacks
type HTMLCallback func(*HTMLElement)
// XMLCallback is the function type of OnXML callbacks
type XMLCallback func(*XMLElement)
// ErrorCallback is the function type of OnError callbacks
type ErrorCallback func(*Response, error)
// ScrapedCallback is the function type of OnScraped callbacks
type ScrapedCallback func(*Response)
// ProxyFunc is the function type of proxy setter functions accepted by
// SetProxyFunc, which installs it as the Proxy field of an http.Transport.
type ProxyFunc func(*http.Request) (*url.URL, error)
// AlreadyVisitedError reports an attempt to visit a URL that was visited
// before.
//
// Visit returns it synchronously when the URL it was given has already been
// visited.
//
// If the already visited URL is only reached after following redirects, the
// error is delivered to the OnError callback and, unless Async mode is
// enabled, is returned by Visit as well.
type AlreadyVisitedError struct {
	// Destination is the URL whose visit was attempted. After a redirect
	// it can differ from the URL originally passed to Visit.
	Destination *url.URL
}

// Error implements the error interface; the destination URL is rendered
// as a quoted string.
func (e *AlreadyVisitedError) Error() string {
	const format = "%q already visited"
	return fmt.Sprintf(format, e.Destination)
}
// htmlCallbackContainer pairs an OnHTML callback with its goquery selector.
// active is set by OnHTML and cleared by OnHTMLDetach; inactive entries are
// skipped when HTML callbacks are run.
type htmlCallbackContainer struct {
Selector string
Function HTMLCallback
active atomic.Bool
}
// xmlCallbackContainer pairs an OnXML callback with its xpath query.
// active is set by OnXML and cleared by OnXMLDetach.
type xmlCallbackContainer struct {
Query string
Function XMLCallback
active atomic.Bool
}
// cookieJarSerializer bundles a storage backend with a lock.
// NOTE(review): presumably the cookie jar built by createJar on top of a
// storage.Storage — its methods are defined elsewhere; confirm there.
type cookieJarSerializer struct {
store storage.Storage
lock *sync.RWMutex
}
// collectorCounter is incremented atomically by Init to give every Collector
// its ID.
var collectorCounter uint32
// The key type is unexported to prevent collisions with context keys defined in
// other packages.
type key int
// Context keys used on the context.Context of outgoing HTTP requests.
const (
// ProxyURLKey is the context key for the request proxy address.
// The value is read back as a string to fill Request.ProxyURL.
ProxyURLKey key = iota
// CheckRevisitKey is the context key under which the request's
// checkRevisit flag (a bool) is stored when the request is created.
CheckRevisitKey
)
// envVariablePrefix is the prefix for environment variables of Colly settings
const envVariablePrefix = "COLLY_"
var (
// ErrForbiddenDomain is the error thrown if visiting
// a domain which is listed in DisallowedDomains or, when AllowedDomains
// is not empty, is missing from AllowedDomains
ErrForbiddenDomain = errors.New("Forbidden domain")
// ErrMissingURL is the error type for missing URL errors
ErrMissingURL = errors.New("Missing URL")
// ErrMaxDepth is the error type for exceeding max depth
ErrMaxDepth = errors.New("Max depth limit reached")
// ErrForbiddenURL is the error thrown if visiting
// a URL which matches one of the DisallowedURLFilters
ErrForbiddenURL = errors.New("ForbiddenURL")
// ErrNoURLFiltersMatch is the error thrown if visiting
// a URL which matches none of the (non-empty) URLFilters
ErrNoURLFiltersMatch = errors.New("No URLFilters match")
// ErrRobotsTxtBlocked is the error type for robots.txt errors
ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt")
// ErrNoCookieJar is the error type for missing cookie jar
ErrNoCookieJar = errors.New("Cookie jar is not available")
// ErrNoPattern is the error type for LimitRules without patterns
ErrNoPattern = errors.New("No pattern defined in LimitRule")
// ErrEmptyProxyURL is the error type for empty Proxy URL list
ErrEmptyProxyURL = errors.New("Proxy URL list is empty")
// ErrAbortedAfterHeaders is the error returned when OnResponseHeaders aborts the transfer.
ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers")
// ErrAbortedBeforeRequest is the error returned when the request is aborted
// before it is sent, i.e. from an OnRequestHeaders callback.
ErrAbortedBeforeRequest = errors.New("Aborted before Do Request")
// ErrQueueFull is the error returned when the queue is full
ErrQueueFull = errors.New("Queue MaxSize reached")
// ErrMaxRequests is the error returned when exceeding max requests
ErrMaxRequests = errors.New("Max Requests limit reached")
// ErrRetryBodyUnseekable is the error when retry with not seekable body
ErrRetryBodyUnseekable = errors.New("Retry Body Unseekable")
)
// envMap maps setting names to functions that apply the setting's string
// value to a Collector.
// NOTE(review): the keys look like environment variable names with the
// COLLY_ prefix removed, presumably resolved by parseSettingsFromEnv —
// confirm against that function.
// Numeric settings whose value fails to parse are silently left unchanged.
var envMap = map[string]func(*Collector, string){
// Comma separated list; entries are not trimmed.
"ALLOWED_DOMAINS": func(c *Collector, val string) {
c.AllowedDomains = strings.Split(val, ",")
},
"CACHE_DIR": func(c *Collector, val string) {
c.CacheDir = val
},
"DETECT_CHARSET": func(c *Collector, val string) {
c.DetectCharset = isYesString(val)
},
// The value is ignored: the cookie jar is dropped whenever this
// handler runs.
"DISABLE_COOKIES": func(c *Collector, _ string) {
c.backend.Client.Jar = nil
},
// Comma separated list; entries are not trimmed.
"DISALLOWED_DOMAINS": func(c *Collector, val string) {
c.DisallowedDomains = strings.Split(val, ",")
},
"IGNORE_ROBOTSTXT": func(c *Collector, val string) {
c.IgnoreRobotsTxt = isYesString(val)
},
// Only a non-"yes" value has an effect: it installs a redirect handler
// that stops at the first response instead of following the redirect.
"FOLLOW_REDIRECTS": func(c *Collector, val string) {
if !isYesString(val) {
c.redirectHandler = func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
}
}
},
"MAX_BODY_SIZE": func(c *Collector, val string) {
size, err := strconv.Atoi(val)
if err == nil {
c.MaxBodySize = size
}
},
"MAX_DEPTH": func(c *Collector, val string) {
maxDepth, err := strconv.Atoi(val)
if err == nil {
c.MaxDepth = maxDepth
}
},
// Parsed with base 0, so the base is taken from the value's prefix.
"MAX_REQUESTS": func(c *Collector, val string) {
maxRequests, err := strconv.ParseUint(val, 0, 32)
if err == nil {
c.MaxRequests = uint32(maxRequests)
}
},
"PARSE_HTTP_ERROR_RESPONSE": func(c *Collector, val string) {
c.ParseHTTPErrorResponse = isYesString(val)
},
"TRACE_HTTP": func(c *Collector, val string) {
c.TraceHTTP = isYesString(val)
},
"USER_AGENT": func(c *Collector, val string) {
c.UserAgent = val
},
}
// urlParser is the shared WHATWG URL parser used to normalize URLs before
// they are handed to net/url.
// NOTE(review): judging by its name, the option makes a lone '%' sign get
// percent-encoded instead of being rejected — confirm in the whatwg-url docs.
var urlParser = whatwgUrl.NewParser(whatwgUrl.WithPercentEncodeSinglePercentSign())
// NewCollector creates a new Collector instance with default configuration.
//
// The defaults are set by Init, then the given options are applied in the
// order they were passed, and finally parseSettingsFromEnv is run on the
// result.
func NewCollector(options ...CollectorOption) *Collector {
	collector := new(Collector)
	collector.Init()
	for i := range options {
		options[i](collector)
	}
	collector.parseSettingsFromEnv()
	return collector
}
// UserAgent returns an option that sets the User-Agent string used by the
// Collector.
func UserAgent(ua string) CollectorOption {
	return func(collector *Collector) { collector.UserAgent = ua }
}
// Headers returns an option that sets the custom headers used by the
// Collector. The map is converted into a fresh http.Header each time the
// option is applied.
func Headers(headers map[string]string) CollectorOption {
	return func(collector *Collector) {
		hdr := http.Header{}
		for name, val := range headers {
			hdr.Add(name, val)
		}
		collector.Headers = &hdr
	}
}
// MaxDepth returns an option that limits the recursion depth of visited
// URLs.
func MaxDepth(depth int) CollectorOption {
	return func(collector *Collector) { collector.MaxDepth = depth }
}
// MaxRequests returns an option that limits the number of requests done by
// the instance. Use 0 for an unlimited number of requests (default).
func MaxRequests(max uint32) CollectorOption {
	return func(collector *Collector) { collector.MaxRequests = max }
}
// AllowedDomains returns an option that sets the domain whitelist used by
// the Collector. The slice is stored as passed, without copying.
func AllowedDomains(domains ...string) CollectorOption {
	return func(collector *Collector) { collector.AllowedDomains = domains }
}
// ParseHTTPErrorResponse returns an option that enables parsing of
// responses with HTTP errors.
func ParseHTTPErrorResponse() CollectorOption {
	return func(collector *Collector) { collector.ParseHTTPErrorResponse = true }
}
// DisallowedDomains returns an option that sets the domain blacklist used
// by the Collector. The slice is stored as passed, without copying.
func DisallowedDomains(domains ...string) CollectorOption {
	return func(collector *Collector) { collector.DisallowedDomains = domains }
}
// DisallowedURLFilters returns an option that sets the regular expressions
// restricting which URLs may be visited: a request is stopped as soon as
// any of them matches its URL.
func DisallowedURLFilters(filters ...*regexp.Regexp) CollectorOption {
	return func(collector *Collector) { collector.DisallowedURLFilters = filters }
}
// URLFilters returns an option that sets the regular expressions
// restricting which URLs may be visited: a request is not stopped if any
// of them matches its URL.
func URLFilters(filters ...*regexp.Regexp) CollectorOption {
	return func(collector *Collector) { collector.URLFilters = filters }
}
// AllowURLRevisit returns an option that lets the Collector download the
// same URL more than once.
func AllowURLRevisit() CollectorOption {
	return func(collector *Collector) { collector.AllowURLRevisit = true }
}
// MaxBodySize returns an option that sets the limit of the retrieved
// response body, in bytes.
func MaxBodySize(sizeInBytes int) CollectorOption {
	return func(collector *Collector) { collector.MaxBodySize = sizeInBytes }
}
// CacheDir returns an option that sets the location where GET requests are
// cached as files.
func CacheDir(path string) CollectorOption {
	return func(collector *Collector) { collector.CacheDir = path }
}
// IgnoreRobotsTxt returns an option that makes the Collector ignore any
// restrictions set by the target host's robots.txt file.
func IgnoreRobotsTxt() CollectorOption {
	return func(collector *Collector) { collector.IgnoreRobotsTxt = true }
}
// TraceHTTP returns an option that makes the Collector collect request
// trace data and report it on Response.Trace.
func TraceHTTP() CollectorOption {
	return func(collector *Collector) { collector.TraceHTTP = true }
}
// StdlibContext returns an option that sets the context.Context used for
// HTTP requests. Supplying a cancellable context allows scraping to be
// cancelled cleanly.
func StdlibContext(ctx context.Context) CollectorOption {
	return func(collector *Collector) { collector.Context = ctx }
}
// ID returns an option that overrides the unique identifier of the
// Collector.
func ID(id uint32) CollectorOption {
	return func(collector *Collector) { collector.ID = id }
}
// Async returns an option that configures asynchronous network requests.
// Called without arguments it turns them on; otherwise the first argument
// decides and any further arguments are ignored.
func Async(a ...bool) CollectorOption {
	return func(collector *Collector) {
		enabled := true
		if len(a) > 0 {
			enabled = a[0]
		}
		collector.Async = enabled
	}
}
// DetectCharset returns an option that enables character encoding detection
// for non-utf8 response bodies without explicit charset declaration.
// This feature uses https://github.com/saintfish/chardet
func DetectCharset() CollectorOption {
	return func(collector *Collector) { collector.DetectCharset = true }
}
// Debugger returns an option that installs d as the Collector's debugger.
// d.Init is called every time the option is applied, before d is attached.
func Debugger(d debug.Debugger) CollectorOption {
	attach := func(collector *Collector) {
		d.Init()
		collector.debugger = d
	}
	return attach
}
// CheckHead returns an option that makes the Collector perform a HEAD
// request before every GET to pre-validate the response.
func CheckHead() CollectorOption {
	return func(collector *Collector) { collector.CheckHead = true }
}
// CacheExpiration returns an option that sets the maximum age for cache
// files. A cached file older than this duration is ignored and refreshed.
func CacheExpiration(d time.Duration) CollectorOption {
	return func(collector *Collector) { collector.CacheExpiration = d }
}
// Init initializes the Collector's private variables and sets default
// configuration for the Collector.
//
// Every call assigns a new ID taken from the package-wide collector counter
// and replaces the storage, HTTP backend, cookie jar, wait group, lock and
// robots.txt cache with fresh ones.
func (c *Collector) Init() {
	// Public settings.
	c.UserAgent = "colly - https://github.com/gocolly/colly"
	c.Headers = nil
	c.MaxDepth = 0
	c.MaxRequests = 0
	c.MaxBodySize = 10 * 1024 * 1024
	c.IgnoreRobotsTxt = true
	c.TraceHTTP = false
	c.Context = context.Background()
	c.ID = atomic.AddUint32(&collectorCounter, 1)

	// Visited-request storage.
	c.store = &storage.InMemoryStorage{}
	c.store.Init()

	// HTTP backend with its own cookie jar and redirect policy.
	jar, _ := cookiejar.New(nil)
	c.backend = &httpBackend{}
	c.backend.Init(jar)
	c.backend.Client.CheckRedirect = c.checkRedirectFunc()

	// Synchronization primitives and caches.
	c.wg = new(sync.WaitGroup)
	c.lock = new(sync.RWMutex)
	c.robotsMap = map[string]*robotstxt.RobotsData{}
}
// Appengine replaces the Collector's backend http.Client with an
// http.Client provided by appengine/urlfetch, carrying over the cookie jar,
// the redirect policy and the timeout of the previous client.
// Use it when the scraper runs on Google App Engine. Example:
//
//	func startScraper(w http.ResponseWriter, r *http.Request) {
//	  ctx := appengine.NewContext(r)
//	  c := colly.NewCollector()
//	  c.Appengine(ctx)
//	   ...
//	  c.Visit("https://google.ca")
//	}
func (c *Collector) Appengine(ctx context.Context) {
	replacement := urlfetch.Client(ctx)
	previous := c.backend.Client
	replacement.Jar = previous.Jar
	replacement.CheckRedirect = previous.CheckRedirect
	replacement.Timeout = previous.Timeout
	c.backend.Client = replacement
}
// Visit starts the Collector's collecting job by issuing a GET request to
// the given URL and running the previously registered callbacks.
// When CheckHead is set, a HEAD request is issued first and the GET is
// skipped if that HEAD request returns an error.
func (c *Collector) Visit(URL string) error {
	if !c.CheckHead {
		return c.scrape(URL, "GET", 1, nil, nil, nil, true)
	}
	if err := c.scrape(URL, "HEAD", 1, nil, nil, nil, true); err != nil {
		return err
	}
	return c.scrape(URL, "GET", 1, nil, nil, nil, true)
}
// HasVisited reports whether the provided URL has been visited.
func (c *Collector) HasVisited(URL string) (bool, error) {
	visited, err := c.checkHasVisited(URL, nil)
	return visited, err
}
// HasPosted reports whether the provided URL has been visited with the
// given requestData. It is mostly useful to avoid re-sending the same POST
// body to the same URL.
func (c *Collector) HasPosted(URL string, requestData map[string]string) (bool, error) {
	posted, err := c.checkHasVisited(URL, requestData)
	return posted, err
}
// Head starts a collector job by creating a HEAD request.
// Unlike Visit, it does not check whether the URL was visited before.
func (c *Collector) Head(URL string) error {
	const checkRevisit = false
	return c.scrape(URL, "HEAD", 1, nil, nil, nil, checkRevisit)
}
// Post starts a collector job by creating a POST request whose body is the
// form built from requestData, and runs the previously registered callbacks.
func (c *Collector) Post(URL string, requestData map[string]string) error {
	form := createFormReader(requestData)
	return c.scrape(URL, "POST", 1, form, nil, nil, true)
}
// PostRaw starts a collector job by creating a POST request with raw binary
// data as its body, and runs the previously registered callbacks.
func (c *Collector) PostRaw(URL string, requestData []byte) error {
	body := bytes.NewReader(requestData)
	return c.scrape(URL, "POST", 1, body, nil, nil, true)
}
// PostMultipart starts a collector job by creating a multipart POST request
// with raw binary data, and runs the previously registered callbacks.
// The request carries its own header set (Content-Type with a random
// boundary, and the Collector's User-Agent), so the Collector's custom
// Headers are not copied into it.
func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error {
	boundary := randomBoundary()
	hdr := http.Header{
		"Content-Type": {"multipart/form-data; boundary=" + boundary},
		"User-Agent":   {c.UserAgent},
	}
	body := createMultipartReader(boundary, requestData)
	return c.scrape(URL, "POST", 1, body, nil, hdr, true)
}
// Request starts a collector job by creating a custom HTTP request where
// the method, context, headers and request data can be specified.
// Pass nil for requestData, ctx or hdr to leave them unset.
// Valid methods:
//   - "GET"
//   - "HEAD"
//   - "POST"
//   - "PUT"
//   - "DELETE"
//   - "PATCH"
//   - "OPTIONS"
func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error {
	const (
		depth        = 1
		checkRevisit = true
	)
	return c.scrape(URL, method, depth, requestData, ctx, hdr, checkRevisit)
}
// SetDebugger initializes d and attaches it to the collector as its
// debugger.
func (c *Collector) SetDebugger(d debug.Debugger) {
	// Initialize first so the collector never holds an uninitialized debugger.
	d.Init()
	c.debugger = d
}
// UnmarshalRequest creates a Request from JSON-serialized data.
// The returned Request is bound to this collector, gets a new request ID
// and a fresh Context populated with the serialized context values.
// An error is returned if the JSON or the contained URL cannot be parsed.
func (c *Collector) UnmarshalRequest(r []byte) (*Request, error) {
	var sr serializableRequest
	if err := json.Unmarshal(r, &sr); err != nil {
		return nil, err
	}
	parsed, err := url.Parse(sr.URL)
	if err != nil {
		return nil, err
	}
	reqCtx := NewContext()
	for name, val := range sr.Ctx {
		reqCtx.Put(name, val)
	}
	request := &Request{
		Method:    sr.Method,
		URL:       parsed,
		Depth:     sr.Depth,
		Body:      bytes.NewReader(sr.Body),
		Ctx:       reqCtx,
		ID:        c.requestCount.Add(1),
		Headers:   &sr.Headers,
		collector: c,
	}
	return request, nil
}
// scrape validates a request and hands it to fetch.
//
// u is normalized with the WHATWG URL parser and then re-parsed with
// net/url. When hdr is nil a header set is built from c.Headers; a
// User-Agent header is added if none is present (this writes into a
// caller-supplied hdr). A seekable requestData is rewound to its start so
// the same body can be sent again.
//
// The request is created on a context derived from c.Context that carries
// checkRevisit under CheckRevisitKey. requestCheck then enforces depth,
// request count, filters, robots.txt and the revisit policy; its error is
// returned without any fetch being started.
//
// In Async mode fetch runs in its own goroutine and scrape returns nil
// immediately; otherwise the result of fetch is returned.
func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
	parsedWhatwgURL, err := urlParser.Parse(u)
	if err != nil {
		return err
	}
	parsedURL, err := url.Parse(parsedWhatwgURL.Href(false))
	if err != nil {
		return err
	}
	if hdr == nil {
		hdr = http.Header{}
		if c.Headers != nil {
			for k, v := range *c.Headers {
				for _, value := range v {
					hdr.Add(k, value)
				}
			}
		}
	}
	if _, ok := hdr["User-Agent"]; !ok {
		hdr.Set("User-Agent", c.UserAgent)
	}
	if seeker, ok := requestData.(io.ReadSeeker); ok {
		if _, err := seeker.Seek(0, io.SeekStart); err != nil {
			return err
		}
	}
	reqCtx := context.WithValue(c.Context, CheckRevisitKey, checkRevisit)
	req, err := http.NewRequestWithContext(reqCtx, method, parsedURL.String(), requestData)
	if err != nil {
		return err
	}
	req.Header = hdr
	// The Go HTTP API ignores "Host" in the headers, preferring the client
	// to use the Host field on Request.
	if hostHeader := hdr.Get("Host"); hostHeader != "" {
		req.Host = hostHeader
	}
	if err := c.requestCheck(parsedURL, method, req.GetBody, depth, checkRevisit); err != nil {
		return err
	}
	u = parsedURL.String()
	// The matching Done is deferred in fetch.
	c.wg.Add(1)
	if c.Async {
		go c.fetch(u, method, depth, requestData, ctx, hdr, req)
		return nil
	}
	return c.fetch(u, method, depth, requestData, ctx, hdr, req)
}
// fetch performs the HTTP exchange for a request prepared by scrape and
// runs the registered callbacks: OnRequest first, then (through the
// functions handed to the backend) OnRequestHeaders and OnResponseHeaders,
// then OnError on failure, or OnResponse, OnHTML, OnXML and OnScraped on
// success.
//
// It always releases the WaitGroup slot taken by scrape. A nil ctx is
// replaced by a new Context. The u and hdr parameters are not read by the
// body; URL and headers are taken from req.
//
// It returns nil when an OnRequest callback aborted the request, the error
// reported by handleOnError when the exchange failed, or the error from
// fixing the response charset. Errors from HTML/XML processing are passed
// to OnError and the callbacks still run to completion; the XML error is
// returned if there is one, otherwise the HTML error.
func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error {
	defer c.wg.Done()
	if ctx == nil {
		ctx = NewContext()
	}
	request := &Request{
		URL:       req.URL,
		Headers:   &req.Header,
		Host:      req.Host,
		Ctx:       ctx,
		Depth:     depth,
		Method:    method,
		Body:      requestData,
		collector: c,
		ID:        c.requestCount.Add(1),
	}
	if req.Header.Get("Accept") == "" {
		req.Header.Set("Accept", "*/*")
	}
	c.handleOnRequest(request)
	if request.abort {
		return nil
	}
	if method == "POST" && req.Header.Get("Content-Type") == "" {
		req.Header.Add("Content-Type", "application/x-www-form-urlencoded")
	}
	var hTrace *HTTPTrace
	if c.TraceHTTP {
		hTrace = &HTTPTrace{}
		req = hTrace.WithTrace(req)
	}
	origURL := req.URL
	checkResponseHeadersFunc := func(req *http.Request, statusCode int, headers http.Header) bool {
		// A different URL pointer means the response belongs to another
		// request than the one sent; expose its URL and headers.
		if req.URL != origURL {
			request.URL = req.URL
			request.Headers = &req.Header
		}
		c.handleOnResponseHeaders(&Response{Ctx: ctx, Request: request, StatusCode: statusCode, Headers: &headers})
		return !request.abort
	}
	checkRequestHeadersFunc := func(req *http.Request) bool {
		c.handleOnRequestHeaders(request)
		return !request.abort
	}
	response, err := c.backend.Cache(req, c.MaxBodySize, checkRequestHeadersFunc, checkResponseHeadersFunc, c.CacheDir, c.CacheExpiration)
	if proxyURL, ok := req.Context().Value(ProxyURLKey).(string); ok {
		request.ProxyURL = proxyURL
	}
	if err := c.handleOnError(response, err, request, ctx); err != nil {
		return err
	}
	c.responseCount.Add(1)
	response.Ctx = ctx
	response.Request = request
	response.Trace = hTrace
	err = response.fixCharset(c.DetectCharset, request.ResponseCharacterEncoding)
	if err != nil {
		return err
	}
	c.handleOnResponse(response)
	// Keep the HTML and XML errors apart so that a successful XML pass
	// cannot hide an HTML failure from the caller.
	htmlErr := c.handleOnHTML(response)
	if htmlErr != nil {
		c.handleOnError(response, htmlErr, request, ctx)
	}
	xmlErr := c.handleOnXML(response)
	if xmlErr != nil {
		c.handleOnError(response, xmlErr, request, ctx)
	}
	c.handleOnScraped(response)
	if xmlErr != nil {
		return xmlErr
	}
	return htmlErr
}
// requestCheck decides whether a request may be performed.
//
// The checks run in this order: MaxDepth, MaxRequests, URL/domain filters,
// robots.txt (skipped for HEAD requests and when IgnoreRobotsTxt is set)
// and finally the revisit policy. The first failing check determines the
// returned error.
//
// The revisit check only runs when checkRevisit is true and AllowURLRevisit
// is false. It hashes the URL together with the body obtained from getBody
// (if any), returns an *AlreadyVisitedError if that hash is already stored,
// and otherwise records the hash as visited.
func (c *Collector) requestCheck(parsedURL *url.URL, method string, getBody func() (io.ReadCloser, error), depth int, checkRevisit bool) error {
	rawURL := parsedURL.String()
	switch {
	case c.MaxDepth > 0 && depth > c.MaxDepth:
		return ErrMaxDepth
	case c.MaxRequests > 0 && c.requestCount.Load() >= c.MaxRequests:
		return ErrMaxRequests
	}
	if err := c.checkFilters(rawURL, parsedURL.Hostname()); err != nil {
		return err
	}
	if !c.IgnoreRobotsTxt && method != "HEAD" {
		if err := c.checkRobots(parsedURL); err != nil {
			return err
		}
	}
	if !checkRevisit || c.AllowURLRevisit {
		return nil
	}
	// TODO weird behaviour, it allows CheckHead to work correctly,
	// but it should probably better be solved with
	// "check-but-not-save" flag or something
	if getBody == nil && method != "GET" {
		return nil
	}
	var body io.ReadCloser
	if getBody != nil {
		rc, err := getBody()
		if err != nil {
			return err
		}
		defer rc.Close()
		body = rc
	}
	hash := requestHash(rawURL, body)
	seen, err := c.store.IsVisited(hash)
	if err != nil {
		return err
	}
	if seen {
		return &AlreadyVisitedError{parsedURL}
	}
	return c.store.Visited(hash)
}
// checkFilters applies the URL and domain restrictions to a request.
// It returns ErrForbiddenURL when a DisallowedURLFilters entry matches URL,
// ErrNoURLFiltersMatch when URLFilters is not empty and none of its entries
// matches, ErrForbiddenDomain when the domain is not allowed, and nil
// otherwise.
func (c *Collector) checkFilters(URL, domain string) error {
	if len(c.DisallowedURLFilters) > 0 && isMatchingFilter(c.DisallowedURLFilters, []byte(URL)) {
		return ErrForbiddenURL
	}
	if len(c.URLFilters) > 0 && !isMatchingFilter(c.URLFilters, []byte(URL)) {
		return ErrNoURLFiltersMatch
	}
	if c.isDomainAllowed(domain) {
		return nil
	}
	return ErrForbiddenDomain
}
// isDomainAllowed reports whether domain may be visited: it must not be
// listed in DisallowedDomains and, if AllowedDomains is not empty, it must
// be listed there. Domains are compared by exact string equality.
func (c *Collector) isDomainAllowed(domain string) bool {
	switch {
	case slices.Contains(c.DisallowedDomains, domain):
		return false
	case len(c.AllowedDomains) == 0:
		// A nil slice has length 0, so this covers the unset case too.
		return true
	default:
		return slices.Contains(c.AllowedDomains, domain)
	}
}
// checkRobots checks u against the robots.txt rules of its host.
//
// Parsed robots.txt data is cached per u.Host in c.robotsMap. On a cache
// miss the file is fetched with the collector's HTTP client, using the
// collector's custom headers, User-Agent and Context, so cancelling the
// Context also cancels this request. Fetch and parse errors are returned
// and nothing is cached in that case.
//
// It returns ErrRobotsTxtBlocked when the group matching c.UserAgent
// disallows the escaped path (plus re-encoded query, if any), and nil when
// the URL is allowed or no group applies.
func (c *Collector) checkRobots(u *url.URL) error {
	c.lock.RLock()
	robot, ok := c.robotsMap[u.Host]
	c.lock.RUnlock()
	if !ok {
		// no robots file cached
		// Prepare request,
		req, err := http.NewRequestWithContext(c.Context, "GET", u.Scheme+"://"+u.Host+"/robots.txt", nil)
		if err != nil {
			return err
		}
		hdr := http.Header{}
		if c.Headers != nil {
			for k, v := range *c.Headers {
				for _, value := range v {
					hdr.Add(k, value)
				}
			}
		}
		if _, ok := hdr["User-Agent"]; !ok {
			hdr.Set("User-Agent", c.UserAgent)
		}
		req.Header = hdr
		// The Go HTTP API ignores "Host" in the headers, preferring the client
		// to use the Host field on Request.
		if hostHeader := hdr.Get("Host"); hostHeader != "" {
			req.Host = hostHeader
		}
		resp, err := c.backend.Client.Do(req)
		if err != nil {
			return err
		}
		defer resp.Body.Close()
		robot, err = robotstxt.FromResponse(resp)
		if err != nil {
			return err
		}
		c.lock.Lock()
		c.robotsMap[u.Host] = robot
		c.lock.Unlock()
	}
	uaGroup := robot.FindGroup(c.UserAgent)
	if uaGroup == nil {
		return nil
	}
	eu := u.EscapedPath()
	if u.RawQuery != "" {
		eu += "?" + u.Query().Encode()
	}
	if !uaGroup.Test(eu) {
		return ErrRobotsTxtBlocked
	}
	return nil
}
// String is the text representation of the collector.
// It reports the request and response counters and the number of
// registered OnRequest, OnHTML, OnResponse and OnError callbacks.
func (c *Collector) String() string {
	const format = "Requests made: %d (%d responses) | Callbacks: OnRequest: %d, OnHTML: %d, OnResponse: %d, OnError: %d"
	requests, responses := c.requestCount.Load(), c.responseCount.Load()
	return fmt.Sprintf(
		format,
		requests,
		responses,
		len(c.requestCallbacks),
		len(c.htmlCallbacks),
		len(c.responseCallbacks),
		len(c.errorCallbacks),
	)
}
// Wait blocks until all collector jobs tracked by the collector's wait
// group have finished.
func (c *Collector) Wait() {
	pending := c.wg
	pending.Wait()
}
// OnRequest registers a function that will be executed on every request
// made by the Collector. Callbacks run in registration order.
func (c *Collector) OnRequest(f RequestCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()
	if c.requestCallbacks == nil {
		c.requestCallbacks = make([]RequestCallback, 0, 4)
	}
	c.requestCallbacks = append(c.requestCallbacks, f)
}
// OnResponseHeaders registers a function that will be executed on every
// response once the headers and status have been received but before the
// body is read.
//
// As in OnRequest, Request.Abort can be called to abort the transfer. This
// is handy when, for example, all hyperlinks are followed but file
// downloads should be avoided.
//
// Be aware that using this will prevent HTTP/1.1 connection reuse, as
// the only way to abort a download is to immediately close the connection.
// HTTP/2 doesn't suffer from this problem, as it's possible to close
// specific stream inside the connection.
func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()
	c.responseHeadersCallbacks = append(c.responseHeadersCallbacks, f)
}
// OnRequestHeaders registers a function that will be executed for every
// request made by the Collector, right before the request is sent.
func (c *Collector) OnRequestHeaders(f RequestCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()
	c.requestHeadersCallbacks = append(c.requestHeadersCallbacks, f)
}
// OnResponse registers a function that will be executed on every response.
// Callbacks run in registration order.
func (c *Collector) OnResponse(f ResponseCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()
	if c.responseCallbacks == nil {
		c.responseCallbacks = make([]ResponseCallback, 0, 4)
	}
	c.responseCallbacks = append(c.responseCallbacks, f)
}
// OnHTML registers a function that will be executed on every HTML element
// matched by the goquerySelector parameter.
// The selector syntax is the one used by
// https://github.com/PuerkitoBio/goquery
// The callback is registered as active; OnHTMLDetach deactivates it.
func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback) {
	entry := &htmlCallbackContainer{
		Selector: goquerySelector,
		Function: f,
	}
	entry.active.Store(true)

	c.lock.Lock()
	defer c.lock.Unlock()
	if c.htmlCallbacks == nil {
		c.htmlCallbacks = make([]*htmlCallbackContainer, 0, 4)
	}
	c.htmlCallbacks = append(c.htmlCallbacks, entry)
}
// OnXML registers a function that will be executed on every XML element
// matched by the xpathQuery parameter.
// The xpath syntax is the one used by https://github.com/antchfx/xmlquery
// The callback is registered as active; OnXMLDetach deactivates it.
func (c *Collector) OnXML(xpathQuery string, f XMLCallback) {
	entry := &xmlCallbackContainer{
		Query:    xpathQuery,
		Function: f,
	}
	entry.active.Store(true)

	c.lock.Lock()
	defer c.lock.Unlock()
	if c.xmlCallbacks == nil {
		c.xmlCallbacks = make([]*xmlCallbackContainer, 0, 4)
	}
	c.xmlCallbacks = append(c.xmlCallbacks, entry)
}
// OnHTMLDetach deregisters the OnHTML callbacks of the given selector.
// Every callback registered with exactly this selector string is marked
// inactive and will no longer be executed; the entries stay in the list.
func (c *Collector) OnHTMLDetach(goquerySelector string) {
	c.lock.Lock()
	defer c.lock.Unlock()
	for i := range c.htmlCallbacks {
		if entry := c.htmlCallbacks[i]; entry.Selector == goquerySelector {
			entry.active.Store(false)
		}
	}
}
// OnXMLDetach deregisters the OnXML callbacks of the given xpath query.
// Every callback registered with exactly this query string is marked
// inactive and will no longer be executed; the entries stay in the list.
func (c *Collector) OnXMLDetach(xpathQuery string) {
	c.lock.Lock()
	defer c.lock.Unlock()
	for i := range c.xmlCallbacks {
		if entry := c.xmlCallbacks[i]; entry.Query == xpathQuery {
			entry.active.Store(false)
		}
	}
}
// OnError registers a function that will be executed if an error occurs
// during the HTTP request.
func (c *Collector) OnError(f ErrorCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()
	if c.errorCallbacks == nil {
		c.errorCallbacks = make([]ErrorCallback, 0, 4)
	}
	c.errorCallbacks = append(c.errorCallbacks, f)
}
// OnScraped registers a function that will be executed as the final part
// of the scraping, after OnHTML and OnXML have finished.
func (c *Collector) OnScraped(f ScrapedCallback) {
	c.lock.Lock()
	defer c.lock.Unlock()
	if c.scrapedCallbacks == nil {
		c.scrapedCallbacks = make([]ScrapedCallback, 0, 4)
	}
	c.scrapedCallbacks = append(c.scrapedCallbacks, f)
}
// SetClient replaces the http.Client used by the collector's backend.
// Nothing is carried over from the previous client.
func (c *Collector) SetClient(client *http.Client) {
	backend := c.backend
	backend.Client = client
}
// WithTransport sets a custom http.RoundTripper (transport) on the
// collector's HTTP client.
func (c *Collector) WithTransport(transport http.RoundTripper) {
	client := c.backend.Client
	client.Transport = transport
}
// DisableCookies turns off cookie handling by removing the cookie jar from
// the collector's HTTP client.
func (c *Collector) DisableCookies() {
	client := c.backend.Client
	client.Jar = nil
}
// SetCookieJar replaces the cookie jar of the collector's HTTP client.
func (c *Collector) SetCookieJar(j http.CookieJar) {
	client := c.backend.Client
	client.Jar = j
}
// SetRequestTimeout overrides the default timeout (10 seconds) for this
// collector by setting the Timeout of its HTTP client.
func (c *Collector) SetRequestTimeout(timeout time.Duration) {
	client := c.backend.Client
	client.Timeout = timeout
}
// SetStorage overrides the default in-memory storage.
// Storage stores scraping related data like cookies and visited urls.
// s is initialized first; if that fails the error is returned and the
// collector keeps its current storage and cookie jar. On success the HTTP
// client's cookie jar is replaced by one created from s.
func (c *Collector) SetStorage(s storage.Storage) error {
	err := s.Init()
	if err != nil {
		return err
	}
	c.store = s
	c.backend.Client.Jar = createJar(s)
	return nil
}
// SetProxy sets a fixed proxy for the collector by handing
// http.ProxyURL(proxyURL) to SetProxyFunc. The previously used transport is
// replaced if it is not an *http.Transport.
// The proxy type is determined by the URL scheme. "http"
// and "socks5" are supported. If the scheme is empty,
// "http" is assumed.
// An error is returned, and nothing is changed, if proxyURL cannot be
// parsed.
func (c *Collector) SetProxy(proxyURL string) error {
	parsed, err := url.Parse(proxyURL)
	if err != nil {
		return err
	}
	fixedProxy := http.ProxyURL(parsed)
	c.SetProxyFunc(fixedProxy)
	return nil
}
// SetProxyFunc sets a custom proxy setter/switcher function.
// See built-in ProxyFuncs for more details.
// If the client's transport is an *http.Transport it is updated in place;
// any other transport (including none) is replaced by a new http.Transport.
// In both cases keep-alives are disabled on the transport.
// The proxy type is determined by the URL scheme. "http"
// and "socks5" are supported. If the scheme is empty,
// "http" is assumed.
func (c *Collector) SetProxyFunc(p ProxyFunc) {
	// The assertion fails for a nil Transport, so no separate nil check
	// is needed.
	if existing, ok := c.backend.Client.Transport.(*http.Transport); ok {
		existing.Proxy = p
		existing.DisableKeepAlives = true
		return
	}
	c.backend.Client.Transport = &http.Transport{
		Proxy:             p,
		DisableKeepAlives: true,
	}
}
// createEvent builds the debug.Event describing one callback stage
// (eventType) of the request requestID made by the collector collectorID.
// kvargs is stored as the event's values without being copied.
func createEvent(eventType string, requestID, collectorID uint32, kvargs map[string]string) *debug.Event {
	event := new(debug.Event)
	event.Type = eventType
	event.RequestID = requestID
	event.CollectorID = collectorID
	event.Values = kvargs
	return event
}
// handleOnRequest reports a "request" event to the debugger (if one is
// attached) and runs the OnRequest callbacks in registration order.
//
// The callback list is snapshotted under the read lock, mirroring how the
// HTML callbacks are read, because OnRequest appends to it under the write
// lock. The lock is released before the callbacks run, so a callback may
// itself register further callbacks.
func (c *Collector) handleOnRequest(r *Request) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("request", r.ID, c.ID, map[string]string{
			"url": r.URL.String(),
		}))
	}
	c.lock.RLock()
	callbacks := slices.Clone(c.requestCallbacks)
	c.lock.RUnlock()
	for _, f := range callbacks {
		f(r)
	}
}
// handleOnResponse reports a "response" event (URL and status text) to the
// debugger, if one is attached, and runs the OnResponse callbacks in
// registration order.
//
// The callback list is snapshotted under the read lock, mirroring how the
// HTML callbacks are read, because OnResponse appends to it under the
// write lock. The lock is released before the callbacks run.
func (c *Collector) handleOnResponse(r *Response) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("response", r.Request.ID, c.ID, map[string]string{
			"url":    r.Request.URL.String(),
			"status": http.StatusText(r.StatusCode),
		}))
	}
	c.lock.RLock()
	callbacks := slices.Clone(c.responseCallbacks)
	c.lock.RUnlock()
	for _, f := range callbacks {
		f(r)
	}
}
// handleOnResponseHeaders reports a "responseHeaders" event (URL and status
// text) to the debugger, if one is attached, and runs the OnResponseHeaders
// callbacks in registration order.
//
// The callback list is snapshotted under the read lock, mirroring how the
// HTML callbacks are read, because OnResponseHeaders appends to it under
// the write lock. The lock is released before the callbacks run.
func (c *Collector) handleOnResponseHeaders(r *Response) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("responseHeaders", r.Request.ID, c.ID, map[string]string{
			"url":    r.Request.URL.String(),
			"status": http.StatusText(r.StatusCode),
		}))
	}
	c.lock.RLock()
	callbacks := slices.Clone(c.responseHeadersCallbacks)
	c.lock.RUnlock()
	for _, f := range callbacks {
		f(r)
	}
}
// handleOnRequestHeaders reports a "requestHeaders" event to the debugger
// (if one is attached) and runs the OnRequestHeaders callbacks in
// registration order.
//
// The callback list is snapshotted under the read lock, mirroring how the
// HTML callbacks are read, because OnRequestHeaders appends to it under
// the write lock. The lock is released before the callbacks run.
func (c *Collector) handleOnRequestHeaders(r *Request) {
	if c.debugger != nil {
		c.debugger.Event(createEvent("requestHeaders", r.ID, c.ID, map[string]string{
			"url": r.URL.String(),
		}))
	}
	c.lock.RLock()
	callbacks := slices.Clone(c.requestHeadersCallbacks)
	c.lock.RUnlock()
	for _, f := range callbacks {
		f(r)
	}
}
// handleOnHTML runs the active OnHTML callbacks against an HTML response.
//
// It works on a snapshot of the callback list taken under the read lock and
// returns nil right away when no callbacks are registered. The media type
// is taken from the Content-Type header, or sniffed from the body when the
// header is empty; anything other than text/html or application/xhtml+xml
// is skipped.
//
// If the document has a <base href>, the request's base URL is set to that
// reference resolved against the request URL (resolution failures are
// ignored). Every callback is then invoked once per matched node, with a
// per-callback running index, after a "html" debugger event has been
// emitted for that node.
//
// The only error returned is a failure to parse the document.
func (c *Collector) handleOnHTML(resp *Response) error {
	c.lock.RLock()
	callbacks := slices.Clone(c.htmlCallbacks)
	c.lock.RUnlock()
	if len(callbacks) == 0 {
		return nil
	}

	contentType := resp.Headers.Get("Content-Type")
	if contentType == "" {
		contentType = http.DetectContentType(resp.Body)
	}
	// implementation of mime.ParseMediaType without parsing the params
	// part
	mediatype, _, _ := strings.Cut(contentType, ";")
	mediatype = strings.TrimSpace(strings.ToLower(mediatype))
	// TODO we also want to parse application/xml as XHTML if it has
	// appropriate doctype
	if mediatype != "text/html" && mediatype != "application/xhtml+xml" {
		return nil
	}

	doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(resp.Body))
	if err != nil {
		return err
	}
	if href, found := doc.Find("base[href]").Attr("href"); found {
		if ref, err := urlParser.ParseRef(resp.Request.URL.String(), href); err == nil {
			if baseURL, err := url.Parse(ref.Href(false)); err == nil {
				resp.Request.baseURL = baseURL
			}
		}
	}

	for _, cb := range callbacks {
		if !cb.active.Load() {
			continue
		}
		matchIndex := 0
		doc.Find(cb.Selector).Each(func(_ int, sel *goquery.Selection) {
			for _, node := range sel.Nodes {
				elem := NewHTMLElementFromSelectionNode(resp, sel, node, matchIndex)
				matchIndex++
				if c.debugger != nil {
					c.debugger.Event(createEvent("html", resp.Request.ID, c.ID, map[string]string{
						"selector": cb.Selector,
						"url":      resp.Request.URL.String(),
					}))
				}
				cb.Function(elem)
			}
		})
	}
	return nil
}
// handleOnXML runs the active OnXML callbacks against resp.
//
// The response is handled as HTML when its Content-Type contains "html",
// and as XML when the Content-Type contains "xml" or the request path ends
// in ".xml" or ".xml.gz". The HTML check wins when both apply. Any other
// response is ignored. A parse failure is returned to the caller.
//
// In both modes the element's Index is set to its position among the nodes
// matched by the callback's query.
func (c *Collector) handleOnXML(resp *Response) error {
	// Work on a snapshot so the collector lock is not held while the
	// callbacks run.
	c.lock.RLock()
	xmlCallbacks := slices.Clone(c.xmlCallbacks)
	c.lock.RUnlock()
	if len(xmlCallbacks) == 0 {
		return nil
	}

	contentType := strings.ToLower(resp.Headers.Get("Content-Type"))
	urlPath := strings.ToLower(resp.Request.URL.Path)
	isXMLFile := strings.HasSuffix(urlPath, ".xml") || strings.HasSuffix(urlPath, ".xml.gz")
	isHTML := strings.Contains(contentType, "html")
	isXML := strings.Contains(contentType, "xml") || isXMLFile
	if !isHTML && !isXML {
		return nil
	}

	if isHTML {
		doc, err := htmlquery.Parse(bytes.NewBuffer(resp.Body))
		if err != nil {
			return err
		}
		// Honour the first <base> element: its href, if parseable,
		// becomes the base URL of the request.
		if e := htmlquery.FindOne(doc, "//base"); e != nil {
			for _, a := range e.Attr {
				if a.Key == "href" {
					baseURL, err := resp.Request.URL.Parse(a.Val)
					if err == nil {
						resp.Request.baseURL = baseURL
					}
					break
				}
			}
		}
		for _, cc := range xmlCallbacks {
			if !cc.active.Load() {
				continue
			}
			for i, n := range htmlquery.Find(doc, cc.Query) {
				e := NewXMLElementFromHTMLNode(resp, n)
				e.Index = i
				if c.debugger != nil {
					c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{
						"selector": cc.Query,
						"url":      resp.Request.URL.String(),
					}))
				}
				cc.Function(e)
			}
		}
		return nil
	}

	doc, err := xmlquery.Parse(bytes.NewBuffer(resp.Body))
	if err != nil {
		return err
	}
	for _, cc := range xmlCallbacks {
		if !cc.active.Load() {
			continue
		}
		xmlquery.FindEach(doc, cc.Query, func(i int, n *xmlquery.Node) {
			e := NewXMLElementFromXMLNode(resp, n)
			// Record the match position, as the HTML branch does.
			e.Index = i
			if c.debugger != nil {
				c.debugger.Event(createEvent("xml", resp.Request.ID, c.ID, map[string]string{
					"selector": cc.Query,
					"url":      resp.Request.URL.String(),
				}))
			}
			cc.Function(e)
		})
	}
	return nil
}
// handleOnError decides whether a finished request counts as failed and, if
// so, notifies the debugger and the error callbacks.
//
// With a nil err the request is successful when ParseHTTPErrorResponse is
// set or the status code is below 203; otherwise an error is synthesized
// from the status text. The error passed to the callbacks is also returned;
// nil is returned on success.
func (c *Collector) handleOnError(response *Response, err error, request *Request, ctx *Context) error {
	if err == nil {
		if c.ParseHTTPErrorResponse || response.StatusCode < 203 {
			return nil
		}
		err = errors.New(http.StatusText(response.StatusCode))
	}

	// Callbacks always receive a non-nil response.
	if response == nil {
		response = &Response{Request: request, Ctx: ctx}
	}

	if c.debugger != nil {
		values := map[string]string{
			"url":    request.URL.String(),
			"status": http.StatusText(response.StatusCode),
		}
		c.debugger.Event(createEvent("error", request.ID, c.ID, values))
	}

	// Fill in whatever the response is still missing from the request.
	if response.Request == nil {
		response.Request = request
	}
	if response.Ctx == nil {
		response.Ctx = request.Ctx
	}

	callbacks := c.errorCallbacks
	for i := range callbacks {
		callbacks[i](response, err)
	}
	return err
}
// cleanupCallbacks removes, under the collector's write lock, every HTML and
// XML callback container whose active flag is no longer set.
func (c *Collector) cleanupCallbacks() {
	c.lock.Lock()
	defer c.lock.Unlock()

	htmlInactive := func(cb *htmlCallbackContainer) bool { return !cb.active.Load() }
	xmlInactive := func(cb *xmlCallbackContainer) bool { return !cb.active.Load() }

	c.htmlCallbacks = slices.DeleteFunc(c.htmlCallbacks, htmlInactive)
	c.xmlCallbacks = slices.DeleteFunc(c.xmlCallbacks, xmlInactive)
}
// handleOnScraped emits a "scraped" debug event when a debugger is attached,
// invokes every scraped callback in slice order, and finally prunes
// inactive HTML/XML callbacks.
func (c *Collector) handleOnScraped(r *Response) {
	if c.debugger != nil {
		values := map[string]string{"url": r.Request.URL.String()}
		c.debugger.Event(createEvent("scraped", r.Request.ID, c.ID, values))
	}
	callbacks := c.scrapedCallbacks
	for i := range callbacks {
		callbacks[i](r)
	}
	// Inactive callbacks are dropped once per processed response.
	c.cleanupCallbacks()
}
// Limit registers rule with the collector's HTTP backend and returns the
// error reported by the backend, if any.
func (c *Collector) Limit(rule *LimitRule) error {
	err := c.backend.Limit(rule)
	return err
}
// Limits registers all of rules with the collector's HTTP backend and
// returns the error reported by the backend, if any.
func (c *Collector) Limits(rules []*LimitRule) error {
	err := c.backend.Limits(rules)
	return err
}
// SetRedirectHandler registers f as the collector's custom redirect handler
// and installs the collector's redirect check on the backend HTTP client.
// The installed check (see checkRedirectFunc) applies the collector's URL
// filters and revisit rules first and then delegates the decision to f.
func (c *Collector) SetRedirectHandler(f func(req *http.Request, via []*http.Request) error) {
	c.redirectHandler = f
	c.backend.Client.CheckRedirect = c.checkRedirectFunc()
}
// SetCookies stores cookies in the collector's cookie jar for the given URL.
// It returns ErrNoCookieJar when the HTTP client has no jar, or the parse
// error when URL is not a valid URL.
func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error {
	jar := c.backend.Client.Jar
	if jar == nil {
		return ErrNoCookieJar
	}
	target, err := url.Parse(URL)
	if err != nil {
		return err
	}
	jar.SetCookies(target, cookies)
	return nil
}
// Cookies returns the cookies the jar would send in a request for the given
// URL. It returns nil when the HTTP client has no jar or URL cannot be
// parsed.
func (c *Collector) Cookies(URL string) []*http.Cookie {
	jar := c.backend.Client.Jar
	if jar == nil {
		return nil
	}
	target, err := url.Parse(URL)
	if err != nil {
		return nil
	}
	return jar.Cookies(target)
}
// Clone returns a new Collector carrying the receiver's settings but none of
// its callbacks. HTTP backend, robots.txt cache and cookie jar are shared
// between collectors; the store, debugger and lock fields are likewise
// copied as-is from the receiver. The clone gets a fresh ID and its own
// WaitGroup.
func (c *Collector) Clone() *Collector {
	clone := &Collector{
		// Fresh identity and synchronisation state.
		ID: atomic.AddUint32(&collectorCounter, 1),
		wg: new(sync.WaitGroup),

		// Settings taken over from the receiver.
		UserAgent:              c.UserAgent,
		Headers:                c.Headers,
		MaxDepth:               c.MaxDepth,
		MaxRequests:            c.MaxRequests,
		MaxBodySize:            c.MaxBodySize,
		AllowedDomains:         c.AllowedDomains,
		DisallowedDomains:      c.DisallowedDomains,
		URLFilters:             c.URLFilters,
		DisallowedURLFilters:   c.DisallowedURLFilters,
		AllowURLRevisit:        c.AllowURLRevisit,
		CacheDir:               c.CacheDir,
		CacheExpiration:        c.CacheExpiration,
		IgnoreRobotsTxt:        c.IgnoreRobotsTxt,
		Async:                  c.Async,
		ParseHTTPErrorResponse: c.ParseHTTPErrorResponse,
		DetectCharset:          c.DetectCharset,
		CheckHead:              c.CheckHead,
		TraceHTTP:              c.TraceHTTP,
		Context:                c.Context,
		redirectHandler:        c.redirectHandler,

		// Internals handed over from the receiver.
		store:     c.store,
		backend:   c.backend,
		debugger:  c.debugger,
		robotsMap: c.robotsMap,
		lock:      c.lock,

		// Callback lists start out empty.
		requestCallbacks:  make([]RequestCallback, 0, 8),
		responseCallbacks: make([]ResponseCallback, 0, 8),
		htmlCallbacks:     make([]*htmlCallbackContainer, 0, 8),
		xmlCallbacks:      make([]*xmlCallbackContainer, 0, 8),
		errorCallbacks:    make([]ErrorCallback, 0, 8),
		scrapedCallbacks:  make([]ScrapedCallback, 0, 8),
	}
	return clone
}
// checkRedirectFunc returns the function installed as the HTTP client's
// CheckRedirect hook. For every redirect the returned function:
//
//  1. rejects targets that fail the collector's URL/domain filters;
//  2. unless AllowURLRevisit is set or the redirect points back at the
//     originally requested URL, rejects already-visited targets with an
//     AlreadyVisitedError and otherwise records the target as visited;
//  3. delegates to the custom redirect handler when one is set;
//  4. otherwise stops after 10 redirects and strips the Authorization
//     header when the host changes.
func (c *Collector) checkRedirectFunc() func(req *http.Request, via []*http.Request) error {
	return func(req *http.Request, via []*http.Request) error {
		if err := c.checkFilters(req.URL.String(), req.URL.Hostname()); err != nil {
			return fmt.Errorf("Not following redirect to %q: %w", req.URL, err)
		}
		// allow redirects to the original destination
		// to support websites redirecting to the same page while setting
		// session cookies
		samePageRedirect := normalizeURL(req.URL.String()) == normalizeURL(via[0].URL.String())
		if !c.AllowURLRevisit && !samePageRedirect {
			// The request body, when it can be re-obtained, is part of
			// the visited hash.
			var body io.ReadCloser
			if req.GetBody != nil {
				var err error
				body, err = req.GetBody()
				if err != nil {
					return err
				}
				defer body.Close()
			}
			uHash := requestHash(req.URL.String(), body)
			visited, err := c.store.IsVisited(uHash)
			if err != nil {
				return err
			}
			if visited {
				// The revisit check applies unless the request context
				// carries CheckRevisitKey set to false.
				if checkRevisit, ok := req.Context().Value(CheckRevisitKey).(bool); !ok || checkRevisit {
					return &AlreadyVisitedError{req.URL}
				}
			}
			// Record the redirect target as visited.
			err = c.store.Visited(uHash)
			if err != nil {
				return err
			}
		}
		// A custom handler, when set, has the final say.
		if c.redirectHandler != nil {
			return c.redirectHandler(req, via)
		}
		// Honor golangs default of maximum of 10 redirects
		if len(via) >= 10 {
			return http.ErrUseLastResponse
		}
		lastRequest := via[len(via)-1]
		// If domain has changed, remove the Authorization-header if it exists
		if req.URL.Host != lastRequest.URL.Host {
			req.Header.Del("Authorization")
		}
		return nil
	}
}
// parseSettingsFromEnv applies collector settings found in the process
// environment. Every entry whose name starts with envVariablePrefix is looked
// up in envMap (by its name without the prefix) and the matching setter is
// called with the entry's value. Names with no setter are logged.
//
// An entry without "=" is treated as having an empty value instead of
// causing an out-of-range panic.
func (c *Collector) parseSettingsFromEnv() {
	for _, entry := range os.Environ() {
		setting, hasPrefix := strings.CutPrefix(entry, envVariablePrefix)
		if !hasPrefix {
			continue
		}
		// Split on the first "=" only, so values may contain "=".
		name, value, _ := strings.Cut(setting, "=")
		if apply, known := envMap[name]; known {
			apply(c, value)
		} else {
			log.Println("Unknown environment variable:", name)
		}
	}
}
// checkHasVisited reports whether the store already holds the hash of URL
// combined with requestData encoded as a form body.
func (c *Collector) checkHasVisited(URL string, requestData map[string]string) (bool, error) {
	formBody := createFormReader(requestData)
	return c.store.IsVisited(requestHash(URL, formBody))
}
// SanitizeFileName returns a version of fileName that is safe to use as a
// file name. The base name and the extension are sanitized separately, an
// empty extension becomes "unknown", and every "-" in the result is
// replaced by "_".
func SanitizeFileName(fileName string) string {
	ext := filepath.Ext(fileName)
	// ext is always a suffix of fileName, so trimming it yields the stem.
	stem := sanitize.BaseName(strings.TrimSuffix(fileName, ext))

	safeExt := sanitize.BaseName(ext)
	if safeExt == "" {
		safeExt = ".unknown"
	}

	// The first byte of safeExt is dropped; the separator dot is re-added.
	joined := stem + "." + safeExt[1:]
	return strings.ReplaceAll(joined, "-", "_")
}
// createFormReader URL-encodes data as form values and returns a reader over
// the encoded string.
func createFormReader(data map[string]string) io.Reader {
	values := make(url.Values, len(data))
	for key, value := range data {
		values.Add(key, value)
	}
	encoded := values.Encode()
	return strings.NewReader(encoded)
}
// createMultipartReader renders data as a multipart/form-data payload
// delimited by boundary and returns a reader over it. Each map entry becomes
// one part named after its key; parts follow map iteration order.
func createMultipartReader(boundary string, data map[string][]byte) io.Reader {
	delimiter := "--" + boundary

	var payload bytes.Buffer
	payload.WriteString("Content-type: multipart/form-data; boundary=" + boundary + "\n\n")
	for name, content := range data {
		payload.WriteString(delimiter + "\n")
		payload.WriteString("Content-Disposition: form-data; name=" + name + "\n")
		fmt.Fprintf(&payload, "Content-Length: %d \n\n", len(content))
		payload.Write(content)
		payload.WriteString("\n")
	}
	// Closing delimiter.
	payload.WriteString(delimiter + "--\n\n")
	return bytes.NewReader(payload.Bytes())
}
// randomBoundary returns 30 random bytes from crypto/rand encoded as a
// 60-character lowercase hex string. It panics when the random source
// cannot be read.
//
// Borrowed from
// github.com/golang/go/mime/multipart/writer.go#randomBoundary
func randomBoundary() string {
	raw := make([]byte, 30)
	if _, err := io.ReadFull(rand.Reader, raw); err != nil {
		panic(err)
	}
	return fmt.Sprintf("%x", raw)
}
// isYesString reports whether s is, ignoring case, one of the affirmative
// values "1", "yes", "true" or "y".
func isYesString(s string) bool {
	lowered := strings.ToLower(s)
	return lowered == "1" || lowered == "yes" || lowered == "true" || lowered == "y"
}
// createJar wraps storage s in a cookieJarSerializer, with its own RWMutex,
// and returns it as an http.CookieJar.
func createJar(s storage.Storage) http.CookieJar {
	jar := &cookieJarSerializer{
		store: s,
		lock:  new(sync.RWMutex),
	}
	return jar
}
// SetCookies merges cookies into the cookies already stored for u and writes
// the result back to the store. On a name clash the new cookie wins. The
// whole read-merge-write sequence runs under the jar's write lock.
func (j *cookieJarSerializer) SetCookies(u *url.URL, cookies []*http.Cookie) {
	j.lock.Lock()
	defer j.lock.Unlock()

	serialized := j.store.Cookies(u)

	// Start from a copy of the new cookies so the caller's slice is never
	// modified, then append stored cookies whose name is not yet present.
	merged := make([]*http.Cookie, len(cookies))
	copy(merged, cookies)
	for _, stored := range storage.UnstringifyCookies(serialized) {
		if storage.ContainsCookie(merged, stored.Name) {
			continue
		}
		merged = append(merged, stored)
	}

	j.store.SetCookies(u, storage.StringifyCookies(merged))
}
// Cookies returns the cookies stored for u, leaving out cookies that carry
// an expiry in the past and, unless u uses the https scheme, cookies marked
// Secure.
func (j *cookieJarSerializer) Cookies(u *url.URL) []*http.Cookie {
	stored := storage.UnstringifyCookies(j.store.Cookies(u))

	now := time.Now()
	usable := make([]*http.Cookie, 0, len(stored))
	for _, cookie := range stored {
		// Only cookies with an explicit expiry can be expired.
		expired := cookie.RawExpires != "" && cookie.Expires.Before(now)
		insecureTransport := cookie.Secure && u.Scheme != "https"
		if expired || insecureTransport {
			continue
		}
		usable = append(usable, cookie)
	}
	return usable
}
// isMatchingFilter reports whether at least one expression in fs matches d.
// An empty or nil fs never matches.
func isMatchingFilter(fs []*regexp.Regexp, d []byte) bool {
	return slices.ContainsFunc(fs, func(re *regexp.Regexp) bool {
		return re.Match(d)
	})
}
// normalizeURL re-serializes u through urlParser and returns the result.
// When u cannot be parsed it is returned unchanged.
func normalizeURL(u string) string {
	if parsed, err := urlParser.Parse(u); err == nil {
		return parsed.String()
	}
	return u
}
// requestHash returns the 64-bit FNV-1a hash of the normalized url followed
// by the contents of body; a nil body contributes nothing.
func requestHash(url string, body io.Reader) uint64 {
	hasher := fnv.New64a()
	// reparse the url to fix ambiguities such as
	// "http://example.com" vs "http://example.com/"
	io.WriteString(hasher, normalizeURL(url))
	if body == nil {
		return hasher.Sum64()
	}
	io.Copy(hasher, body)
	return hasher.Sum64()
}
================================================
FILE: colly_test.go
================================================
// Copyright 2018 Adam Tauber
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package colly
import (
"bufio"
"bytes"
"context"
"errors"
"fmt"
"net/http"
"net/http/httptest"
"net/url"
"os"
"reflect"
"regexp"
"strings"
"testing"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly/v2/debug"
)
// serverIndexResponse is the response body the tests expect when visiting
// the test server's root URL (compared against in TestSetCookieRedirect).
var serverIndexResponse = []byte("hello world\n")
var callbackTestHTML = []byte(`
Hello World
error
")) }) mux.HandleFunc("/user_agent", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) w.Write([]byte(r.Header.Get("User-Agent"))) }) mux.HandleFunc("/host_header", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) w.Write([]byte(r.Host)) }) mux.HandleFunc("/accept_header", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) w.Write([]byte(r.Header.Get("Accept"))) }) mux.HandleFunc("/custom_header", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) w.Write([]byte(r.Header.Get("Test"))) }) mux.HandleFunc("/base", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html") w.Write([]byte(`hello
`)) }) mux.HandleFunc("/100%25", func(w http.ResponseWriter, r *http.Request) { w.Write([]byte("100 percent")) }) mux.HandleFunc("/large_binary", func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/octet-stream") ww := bufio.NewWriter(w) defer ww.Flush() for { // have to check error to detect client aborting download if _, err := ww.Write([]byte{0x41}); err != nil { return } } }) mux.HandleFunc("/slow", func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(200) ticker := time.NewTicker(100 * time.Millisecond) defer ticker.Stop() i := 0 for { select { case <-r.Context().Done(): return case t := <-ticker.C: fmt.Fprintf(w, "%s\n", t) if flusher, ok := w.(http.Flusher); ok { flusher.Flush() } i++ if i == 10 { return } } } }) mux.HandleFunc("/sitemap.xml.gz", func(w http.ResponseWriter, r *http.Request) { // Return a 404 HTML page for a non-existent .xml.gz URL. // This simulates the scenario in issue #745 where a server // returns an HTML error page for a missing gzipped sitemap. w.Header().Set("Content-Type", "text/html") w.WriteHeader(404) w.Write([]byte(` tags")
}
}
// TestCollectorContentSniffing checks that OnHTML callbacks still run for a
// response that carries no Content-Type header.
func TestCollectorContentSniffing(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	collector := NewCollector()

	var sawHTML bool

	collector.OnResponse(func(resp *Response) {
		if values := (*resp.Headers)["Content-Type"]; values != nil {
			t.Error("Content-Type unexpectedly not nil")
		}
	})
	collector.OnHTML("html", func(*HTMLElement) {
		sawHTML = true
	})

	if err := collector.Visit(server.URL + "/html?no-content-type=yes"); err != nil {
		t.Fatal(err)
	}
	if !sawHTML {
		t.Error("OnHTML was not called")
	}
}
// TestCollectorURLRevisit checks that a URL is requested only once by
// default and on every visit once AllowURLRevisit is enabled.
func TestCollectorURLRevisit(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	collector := NewCollector()

	requests := 0
	collector.OnRequest(func(*Request) {
		requests++
	})

	visitTwice := func() {
		for i := 0; i < 2; i++ {
			collector.Visit(server.URL)
		}
	}

	visitTwice()
	if requests != 1 {
		t.Error("URL revisited")
	}

	collector.AllowURLRevisit = true
	visitTwice()
	if requests != 3 {
		t.Error("URL not revisited")
	}
}
// TestCollectorPostRevisit checks that an identical POST (same URL and form
// data) is sent only once by default, that different form data counts as a
// new request, and that AllowURLRevisit lifts the restriction.
func TestCollectorPostRevisit(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	const name = "hello"
	loginURL := server.URL + "/login"
	form := map[string]string{"name": name}

	responses := 0
	collector := NewCollector()
	collector.OnResponse(func(resp *Response) {
		if string(resp.Body) != name {
			t.Error("Failed to send data with POST")
		}
		responses++
	})

	collector.Post(loginURL, form)
	collector.Post(loginURL, form)
	collector.Post(loginURL, map[string]string{
		"name":     name,
		"lastname": "world",
	})
	if responses != 2 {
		t.Error("URL POST revisited")
	}

	collector.AllowURLRevisit = true
	collector.Post(loginURL, form)
	collector.Post(loginURL, form)
	if responses != 4 {
		t.Error("URL POST not revisited")
	}
}
// TestCollectorURLRevisitCheck checks HasVisited before and after a visit,
// and that revisits (direct or via redirects) fail with an
// AlreadyVisitedError naming the expected destination.
func TestCollectorURLRevisitCheck(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	collector := NewCollector()

	seen, err := collector.HasVisited(server.URL)
	if err != nil {
		t.Error(err.Error())
	}
	if seen {
		t.Error("Expected URL to NOT have been visited")
	}

	collector.Visit(server.URL)

	seen, err = collector.HasVisited(server.URL)
	if err != nil {
		t.Error(err.Error())
	}
	if !seen {
		t.Error("Expected URL to have been visited")
	}

	cases := []struct {
		Path             string
		DestinationError string
	}{
		{"/", "/"},
		{"/redirect?d=/", "/"},
		// now that /redirect?d=/ itself is recorded as visited,
		// it's now returned in error
		{"/redirect?d=/", "/redirect?d=/"},
		{"/redirect?d=/redirect%3Fd%3D/", "/redirect?d=/"},
		{"/redirect?d=/redirect%3Fd%3D/", "/redirect?d=/redirect%3Fd%3D/"},
		{"/redirect?d=/redirect%3Fd%3D/&foo=bar", "/redirect?d=/"},
	}

	for idx, tc := range cases {
		visitErr := collector.Visit(server.URL + tc.Path)

		// An empty DestinationError means the visit must succeed.
		if tc.DestinationError == "" {
			if visitErr != nil {
				t.Errorf("got unexpected error in test %d: %q", idx, visitErr)
			}
			continue
		}

		var already *AlreadyVisitedError
		if !errors.As(visitErr, &already) {
			t.Errorf("err=%q returned when trying to revisit, expected AlreadyVisitedError", visitErr)
			continue
		}
		got, want := already.Destination.String(), server.URL+tc.DestinationError
		if got != want {
			t.Errorf("wrong destination in AlreadyVisitedError in test %d, got=%q want=%q", idx, got, want)
		}
	}
}
// TestSetCookieRedirect runs the test server behind each session-cookie
// middleware and checks that visiting the root URL still ends in a 200
// response carrying the index body.
func TestSetCookieRedirect(t *testing.T) {
	type middleware = func(http.Handler) http.Handler

	wrappers := []middleware{
		requireSessionCookieSimple,
		requireSessionCookieAuthPage,
	}
	for _, wrap := range wrappers {
		t.Run("", func(t *testing.T) {
			server := newUnstartedTestServer()
			server.Config.Handler = wrap(server.Config.Handler)
			server.Start()
			defer server.Close()

			collector := NewCollector()
			collector.OnResponse(func(resp *Response) {
				if !bytes.Equal(resp.Body, serverIndexResponse) {
					t.Errorf("bad response body got=%q want=%q", resp.Body, serverIndexResponse)
				}
				if resp.StatusCode != http.StatusOK {
					t.Errorf("bad response code got=%d want=%d", resp.StatusCode, http.StatusOK)
				}
			})

			if err := collector.Visit(server.URL); err != nil {
				t.Fatal(err)
			}
		})
	}
}
// TestCollectorPostURLRevisitCheck checks that HasPosted reports a POST as
// seen only after it was sent, and that changing the form data yields a
// request that has not been seen yet.
func TestCollectorPostURLRevisitCheck(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	collector := NewCollector()
	loginURL := server.URL + "/login"
	form := map[string]string{"name": "hello"}

	// expectPosted asserts the current HasPosted state for loginURL/form.
	expectPosted := func(want bool) {
		t.Helper()
		posted, err := collector.HasPosted(loginURL, form)
		if err != nil {
			t.Error(err.Error())
		}
		if posted == want {
			return
		}
		if want {
			t.Error("Expected URL to have been visited")
		} else {
			t.Error("Expected URL to NOT have been visited")
		}
	}

	expectPosted(false)
	collector.Post(loginURL, form)
	expectPosted(true)

	// Extra form data makes this a different, not yet posted request.
	form["lastname"] = "world"
	expectPosted(false)
	collector.Post(loginURL, form)
	expectPosted(true)
}
// TestCollectorURLRevisitDomainDisallowed verifies that a visit rejected for
// its domain does not mark the URL as visited: a second attempt must fail
// with ErrForbiddenDomain again.
func TestCollectorURLRevisitDomainDisallowed(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	serverURL, err := url.Parse(server.URL)
	if err != nil {
		t.Fatal(err)
	}

	collector := NewCollector(DisallowedDomains(serverURL.Hostname()))

	if got := collector.Visit(server.URL); got != ErrForbiddenDomain {
		t.Fatalf("wrong error on first visit: got=%v want=%v", got, ErrForbiddenDomain)
	}
	if got := collector.Visit(server.URL); got != ErrForbiddenDomain {
		t.Fatalf("wrong error on second visit: got=%v want=%v", got, ErrForbiddenDomain)
	}
}
// TestCollectorPost posts a form to /login and checks that the response body
// equals the submitted "name" value.
func TestCollectorPost(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	const name = "hello"

	collector := NewCollector()
	collector.OnResponse(func(resp *Response) {
		if string(resp.Body) != name {
			t.Error("Failed to send data with POST")
		}
	})

	form := map[string]string{"name": name}
	collector.Post(server.URL+"/login", form)
}
// TestCollectorPostRaw posts a raw form-encoded body to /login and checks
// that the response body equals the submitted "name" value.
func TestCollectorPostRaw(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	const name = "hello"

	collector := NewCollector()
	collector.OnResponse(func(resp *Response) {
		if string(resp.Body) != name {
			t.Error("Failed to send data with POST")
		}
	})

	payload := []byte("name=" + name)
	collector.PostRaw(server.URL+"/login", payload)
}
// TestCollectorPostRawRevisit checks that an identical raw POST is sent only
// once by default, that a different body counts as a new request, and that
// AllowURLRevisit lifts the restriction.
func TestCollectorPostRawRevisit(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	const name = "hello"
	loginURL := server.URL + "/login"
	payload := "name=" + name

	responses := 0
	collector := NewCollector()
	collector.OnResponse(func(resp *Response) {
		if string(resp.Body) != name {
			t.Error("Failed to send data with POST RAW")
		}
		responses++
	})

	collector.PostRaw(loginURL, []byte(payload))
	collector.PostRaw(loginURL, []byte(payload))
	collector.PostRaw(loginURL, []byte(payload+"&lastname=world"))
	if responses != 2 {
		t.Error("URL POST RAW revisited")
	}

	collector.AllowURLRevisit = true
	collector.PostRaw(loginURL, []byte(payload))
	collector.PostRaw(loginURL, []byte(payload))
	if responses != 4 {
		t.Error("URL POST RAW not revisited")
	}
}
// TestRedirect visits /redirect and checks that, after the redirect, both
// the request URL seen by the callbacks and the links resolved from the page
// refer to the /redirected/ location.
func TestRedirect(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	collector := NewCollector()

	collector.OnHTML("a[href]", func(el *HTMLElement) {
		link := el.Request.AbsoluteURL(el.Attr("href"))
		if !strings.HasSuffix(link, "/redirected/test") {
			t.Error("Invalid URL after redirect: " + link)
		}
	})
	collector.OnResponseHeaders(func(resp *Response) {
		current := resp.Request.URL.String()
		if !strings.HasSuffix(current, "/redirected/") {
			t.Error("Invalid URL in Request after redirect (OnResponseHeaders): " + current)
		}
	})
	collector.OnResponse(func(resp *Response) {
		current := resp.Request.URL.String()
		if !strings.HasSuffix(current, "/redirected/") {
			t.Error("Invalid URL in Request after redirect (OnResponse): " + current)
		}
	})

	collector.Visit(server.URL + "/redirect")
}
// TestIssue594 is a regression test for a data race bug. It makes no
// assertions because it is meant to be run under the race detector.
func TestIssue594(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	collector := NewCollector()
	// if timeout is set, this bug is not triggered
	client := &http.Client{Timeout: 0 * time.Second}
	collector.SetClient(client)
	collector.Visit(server.URL)
}
// TestRedirectWithDisallowedURLs checks that a link found on the redirected
// page is rejected with ErrForbiddenURL when it matches a disallowed URL
// filter.
func TestRedirectWithDisallowedURLs(t *testing.T) {
	server := newTestServer()
	defer server.Close()

	collector := NewCollector()
	forbidden := regexp.MustCompile(server.URL + "/redirected/test")
	collector.DisallowedURLFilters = []*regexp.Regexp{forbidden}

	collector.OnHTML("a[href]", func(el *HTMLElement) {
		link := el.Request.AbsoluteURL(el.Attr("href"))
		if err := collector.Visit(link); !errors.Is(err, ErrForbiddenURL) {
			t.Error("URL should have been forbidden: " + link)
		}
	})

	collector.Visit(server.URL + "/redirect")
}
func TestBaseTag(t *testing.T) {
ts := newTestServer()
defer ts.Close()
c := NewCollector()
c.OnHTML("a[href]", func(e *HTMLElement) {
u := e.Request.AbsoluteURL(e.Attr("href"))
if u != "http://xy.com/z" {
t.Error("Invalid
tags") } } func TestCollectorOnXMLWithXML(t *testing.T) { ts := newTestServer() defer ts.Close() c := NewCollector() titleCallbackCalled := false paragraphCallbackCount := 0 c.OnXML("//page/title", func(e *XMLElement) { titleCallbackCalled = true if e.Text != "Test Page" { t.Error("Title element text does not match, got", e.Text) } }) c.OnXML("//page/paragraph", func(e *XMLElement) { paragraphCallbackCount++ if e.Attr("type") != "description" { t.Error("Failed to get paragraph's type attribute") } }) c.OnXML("/page", func(e *XMLElement) { if e.ChildAttr("paragraph", "type") != "description" { t.Error("Invalid type value") } classes := e.ChildAttrs("paragraph", "type") if len(classes) != 2 { t.Error("Invalid type values") } }) c.Visit(ts.URL + "/xml") if !titleCallbackCalled { t.Error("Failed to call OnXML callback for
a
b
c
This is a regular text paragraph.