Showing preview only (1,231K chars total). Download the full file or copy to clipboard to get everything.
Repository: crwlrsoft/crawler
Branch: main
Commit: d6680f9e698a
Files: 326
Total size: 1.1 MB
Directory structure:
gitextract_d22hbn5_/
├── .editorconfig
├── .gitattributes
├── .github/
│ └── workflows/
│ └── ci.yml
├── .gitignore
├── .php-cs-fixer.php
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── bin/
│ └── add-git-hooks
├── composer.json
├── git-hooks/
│ └── pre-commit
├── phpstan.neon
├── phpunit.xml
├── src/
│ ├── Cache/
│ │ ├── CacheItem.php
│ │ ├── Exceptions/
│ │ │ ├── MissingZlibExtensionException.php
│ │ │ └── ReadingCacheFailedException.php
│ │ └── FileCache.php
│ ├── Crawler.php
│ ├── HttpCrawler/
│ │ └── AnonymousHttpCrawlerBuilder.php
│ ├── HttpCrawler.php
│ ├── Input.php
│ ├── Io.php
│ ├── Loader/
│ │ ├── Http/
│ │ │ ├── Browser/
│ │ │ │ ├── Screenshot.php
│ │ │ │ └── ScreenshotConfig.php
│ │ │ ├── Cache/
│ │ │ │ └── RetryManager.php
│ │ │ ├── Cookies/
│ │ │ │ ├── Cookie.php
│ │ │ │ ├── CookieJar.php
│ │ │ │ ├── Date.php
│ │ │ │ └── Exceptions/
│ │ │ │ └── InvalidCookieException.php
│ │ │ ├── Exceptions/
│ │ │ │ └── LoadingException.php
│ │ │ ├── HeadlessBrowserLoaderHelper.php
│ │ │ ├── HttpLoader.php
│ │ │ ├── Messages/
│ │ │ │ └── RespondedRequest.php
│ │ │ ├── Politeness/
│ │ │ │ ├── RetryErrorResponseHandler.php
│ │ │ │ ├── RobotsTxtHandler.php
│ │ │ │ ├── Throttler.php
│ │ │ │ └── TimingUnits/
│ │ │ │ └── MultipleOf.php
│ │ │ └── ProxyManager.php
│ │ ├── Loader.php
│ │ └── LoaderInterface.php
│ ├── Logger/
│ │ ├── CliLogger.php
│ │ └── PreStepInvocationLogger.php
│ ├── Output.php
│ ├── Result.php
│ ├── Steps/
│ │ ├── BaseStep.php
│ │ ├── Csv.php
│ │ ├── Dom/
│ │ │ ├── DomDocument.php
│ │ │ ├── HtmlDocument.php
│ │ │ ├── HtmlElement.php
│ │ │ ├── Node.php
│ │ │ ├── NodeList.php
│ │ │ ├── XmlDocument.php
│ │ │ └── XmlElement.php
│ │ ├── Dom.php
│ │ ├── Exceptions/
│ │ │ └── PreRunValidationException.php
│ │ ├── Filters/
│ │ │ ├── AbstractFilter.php
│ │ │ ├── ArrayFilter.php
│ │ │ ├── ClosureFilter.php
│ │ │ ├── ComparisonFilter.php
│ │ │ ├── Enums/
│ │ │ │ ├── ComparisonFilterRule.php
│ │ │ │ ├── StringFilterRule.php
│ │ │ │ ├── StringLengthFilterRule.php
│ │ │ │ └── UrlFilterRule.php
│ │ │ ├── Filter.php
│ │ │ ├── FilterInterface.php
│ │ │ ├── Filterable.php
│ │ │ ├── NegatedFilter.php
│ │ │ ├── StringFilter.php
│ │ │ ├── StringLengthFilter.php
│ │ │ └── UrlFilter.php
│ │ ├── Group.php
│ │ ├── Html/
│ │ │ ├── CssSelector.php
│ │ │ ├── DomQuery.php
│ │ │ ├── Exceptions/
│ │ │ │ └── InvalidDomQueryException.php
│ │ │ ├── GetLink.php
│ │ │ ├── GetLinks.php
│ │ │ ├── MetaData.php
│ │ │ ├── SchemaOrg.php
│ │ │ ├── SelectorTarget.php
│ │ │ └── XPathQuery.php
│ │ ├── Html.php
│ │ ├── Json.php
│ │ ├── Loading/
│ │ │ ├── GetSitemapsFromRobotsTxt.php
│ │ │ ├── Http/
│ │ │ │ ├── AbstractPaginator.php
│ │ │ │ ├── Browser/
│ │ │ │ │ └── BrowserAction.php
│ │ │ │ ├── Document.php
│ │ │ │ ├── Paginate.php
│ │ │ │ ├── Paginator.php
│ │ │ │ └── Paginators/
│ │ │ │ ├── QueryParams/
│ │ │ │ │ ├── AbstractQueryParamManipulator.php
│ │ │ │ │ ├── Decrementor.php
│ │ │ │ │ ├── Incrementor.php
│ │ │ │ │ └── QueryParamManipulator.php
│ │ │ │ ├── QueryParamsPaginator.php
│ │ │ │ ├── SimpleWebsitePaginator.php
│ │ │ │ └── StopRules/
│ │ │ │ ├── Contains.php
│ │ │ │ ├── IsEmptyInDom.php
│ │ │ │ ├── IsEmptyInHtml.php
│ │ │ │ ├── IsEmptyInJson.php
│ │ │ │ ├── IsEmptyInXml.php
│ │ │ │ ├── IsEmptyResponse.php
│ │ │ │ ├── NotContains.php
│ │ │ │ ├── PaginatorStopRules.php
│ │ │ │ └── StopRule.php
│ │ │ ├── Http.php
│ │ │ ├── HttpBase.php
│ │ │ ├── HttpCrawl.php
│ │ │ └── LoadingStep.php
│ │ ├── Refiners/
│ │ │ ├── AbstractRefiner.php
│ │ │ ├── DateTime/
│ │ │ │ └── DateTimeFormat.php
│ │ │ ├── DateTimeRefiner.php
│ │ │ ├── Html/
│ │ │ │ └── RemoveFromHtml.php
│ │ │ ├── HtmlRefiner.php
│ │ │ ├── RefinerInterface.php
│ │ │ ├── String/
│ │ │ │ ├── AbstractStringRefiner.php
│ │ │ │ ├── StrAfterFirst.php
│ │ │ │ ├── StrAfterLast.php
│ │ │ │ ├── StrBeforeFirst.php
│ │ │ │ ├── StrBeforeLast.php
│ │ │ │ ├── StrBetweenFirst.php
│ │ │ │ ├── StrBetweenLast.php
│ │ │ │ └── StrReplace.php
│ │ │ ├── StringRefiner.php
│ │ │ ├── Url/
│ │ │ │ ├── AbstractUrlRefiner.php
│ │ │ │ ├── WithFragment.php
│ │ │ │ ├── WithHost.php
│ │ │ │ ├── WithPath.php
│ │ │ │ ├── WithPort.php
│ │ │ │ ├── WithQuery.php
│ │ │ │ ├── WithScheme.php
│ │ │ │ └── WithoutPort.php
│ │ │ └── UrlRefiner.php
│ │ ├── Sitemap/
│ │ │ └── GetUrlsFromSitemap.php
│ │ ├── Sitemap.php
│ │ ├── Step.php
│ │ ├── StepInterface.php
│ │ ├── StepOutputType.php
│ │ └── Xml.php
│ ├── Stores/
│ │ ├── JsonFileStore.php
│ │ ├── SimpleCsvFileStore.php
│ │ ├── Store.php
│ │ └── StoreInterface.php
│ ├── UserAgents/
│ │ ├── BotUserAgent.php
│ │ ├── BotUserAgentInterface.php
│ │ ├── UserAgent.php
│ │ └── UserAgentInterface.php
│ └── Utils/
│ ├── Gzip.php
│ ├── HttpHeaders.php
│ ├── OutputTypeHelper.php
│ ├── RequestKey.php
│ └── TemplateString.php
└── tests/
├── Cache/
│ ├── CacheItemTest.php
│ ├── FileCacheTest.php
│ └── _cachefilecontent
├── CrawlerTest.php
├── HttpCrawler/
│ └── AnonymousHttpCrawlerBuilderTest.php
├── IoTest.php
├── Loader/
│ ├── Http/
│ │ ├── Browser/
│ │ │ └── ScreenshotConfigTest.php
│ │ ├── Cache/
│ │ │ └── RetryManagerTest.php
│ │ ├── Cookies/
│ │ │ ├── CookieJarTest.php
│ │ │ ├── CookieTest.php
│ │ │ └── DateTest.php
│ │ ├── HeadlessBrowserLoaderHelperTest.php
│ │ ├── HttpLoaderPolitenessTest.php
│ │ ├── HttpLoaderTest.php
│ │ ├── Messages/
│ │ │ └── RespondedRequestTest.php
│ │ ├── Politeness/
│ │ │ ├── RobotsTxtHandlerTest.php
│ │ │ ├── ThrottlerTest.php
│ │ │ └── TimingUnits/
│ │ │ └── MultipleOfTest.php
│ │ └── ProxyManagerTest.php
│ └── LoaderTest.php
├── Logger/
│ ├── CliLoggerTest.php
│ └── PreStepInvocationLoggerTest.php
├── Pest.php
├── ResultTest.php
├── Steps/
│ ├── BaseStepTest.php
│ ├── CsvTest.php
│ ├── Dom/
│ │ ├── HtmlDocumentTest.php
│ │ ├── HtmlElementTest.php
│ │ ├── NodeListTest.php
│ │ ├── NodeTest.php
│ │ ├── XmlDocumentTest.php
│ │ ├── XmlElementTest.php
│ │ └── _Stubs/
│ │ ├── HtmlNodeStub.php
│ │ └── XmlNodeStub.php
│ ├── DomTest.php
│ ├── Filters/
│ │ ├── ArrayFilterTest.php
│ │ ├── ClosureFilterTest.php
│ │ ├── ComparisonFilterTest.php
│ │ ├── Enums/
│ │ │ ├── ComparisonFilterRuleTest.php
│ │ │ ├── StringFilterRuleTest.php
│ │ │ ├── StringLengthFilterRuleTest.php
│ │ │ └── UrlFilterRuleTest.php
│ │ ├── FilterTest.php
│ │ ├── NegatedFilterTest.php
│ │ ├── StringFilterTest.php
│ │ ├── StringLengthFilterTest.php
│ │ └── UrlFilterTest.php
│ ├── GroupTest.php
│ ├── Html/
│ │ ├── CssSelectorTest.php
│ │ ├── Exceptions/
│ │ │ └── InvalidDomQueryExceptionTest.php
│ │ ├── GetLinkTest.php
│ │ ├── GetLinksTest.php
│ │ ├── MetaDataTest.php
│ │ ├── SchemaOrgTest.php
│ │ └── XPathQueryTest.php
│ ├── HtmlTest.php
│ ├── JsonTest.php
│ ├── Loading/
│ │ ├── GetSitemapsFromRobotsTxtTest.php
│ │ ├── Http/
│ │ │ ├── DocumentTest.php
│ │ │ └── Paginators/
│ │ │ ├── AbstractPaginatorTest.php
│ │ │ ├── QueryParams/
│ │ │ │ ├── AbstractQueryParamManipulatorTest.php
│ │ │ │ ├── DecrementorTest.php
│ │ │ │ └── IncrementorTest.php
│ │ │ ├── QueryParamsPaginatorTest.php
│ │ │ ├── SimpleWebsitePaginatorTest.php
│ │ │ └── StopRules/
│ │ │ ├── ContainsTest.php
│ │ │ ├── IsEmptyInHtmlTest.php
│ │ │ ├── IsEmptyInJsonTest.php
│ │ │ ├── IsEmptyInXmlTest.php
│ │ │ ├── IsEmptyResponseTest.php
│ │ │ └── NotContainsTest.php
│ │ ├── HttpTest.php
│ │ └── LoadingStepTest.php
│ ├── Refiners/
│ │ ├── AbstractRefinerTest.php
│ │ ├── DateTime/
│ │ │ └── DateTimeFormatTest.php
│ │ ├── Html/
│ │ │ └── RemoveFromHtmlTest.php
│ │ ├── String/
│ │ │ ├── AfterFirstTest.php
│ │ │ ├── AfterLastTest.php
│ │ │ ├── BeforeFirstTest.php
│ │ │ ├── BeforeLastTest.php
│ │ │ ├── BetweenFirstTest.php
│ │ │ ├── BetweenLastTest.php
│ │ │ └── ReplaceTest.php
│ │ └── Url/
│ │ ├── WithFragmentTest.php
│ │ ├── WithHostTest.php
│ │ ├── WithPathTest.php
│ │ ├── WithPortTest.php
│ │ ├── WithQueryTest.php
│ │ ├── WithSchemeTest.php
│ │ └── WithoutPortTest.php
│ ├── Sitemap/
│ │ └── GetUrlsFromSitemapTest.php
│ ├── StepTest.php
│ ├── XmlTest.php
│ └── _Files/
│ ├── Csv/
│ │ ├── basic.csv
│ │ ├── enclosure.csv
│ │ ├── escape.csv
│ │ ├── separator.csv
│ │ └── with-column-headlines.csv
│ ├── Html/
│ │ ├── basic.html
│ │ ├── bookstore.html
│ │ └── event.html
│ └── Xml/
│ ├── bookstore.xml
│ ├── events.xml
│ └── rss-with-bom.xml
├── Stores/
│ ├── JsonFileStoreTest.php
│ ├── SimpleCsvFileStoreTest.php
│ └── _files/
│ └── .gitkeep
├── UserAgents/
│ ├── BotUserAgentTest.php
│ └── UserAgentTest.php
├── Utils/
│ ├── GzipTest.php
│ ├── HttpHeadersTest.php
│ ├── OutputTypeHelperTest.php
│ ├── RequestKeyTest.php
│ └── TemplateStringTest.php
├── _Integration/
│ ├── GroupTest.php
│ ├── Http/
│ │ ├── CharsetTest.php
│ │ ├── CrawlingTest.php
│ │ ├── ErrorResponsesTest.php
│ │ ├── GzipTest.php
│ │ ├── HeadlessBrowserTest.php
│ │ ├── Html/
│ │ │ ├── PaginatedListingTest.php
│ │ │ └── SimpleListingTest.php
│ │ ├── PaginationTest.php
│ │ ├── ProxyingTest.php
│ │ ├── PublisherExampleTest.php
│ │ ├── QueryParamPaginationTest.php
│ │ ├── RedirectTest.php
│ │ ├── RequestParamsFromInputTest.php
│ │ ├── RetryErrorResponsesTest.php
│ │ ├── RobotsTxtTest.php
│ │ └── TimeoutTest.php
│ ├── ProxyServer.php
│ ├── Server.php
│ └── _Server/
│ ├── BlogPostWithJsonLd.php
│ ├── BrokenMimeTypeRss.php
│ ├── BrowserActions/
│ │ ├── ClickAndWaitForReload.php
│ │ ├── EvaluateAndWaitForReload.php
│ │ ├── EvaluateAndWaitForReloadReloaded.php
│ │ ├── Main.php
│ │ └── Wait.php
│ ├── Crawling.php
│ ├── HelloWorld.php
│ ├── JsGeneratedContent.php
│ ├── NonUtf8.php
│ ├── PageInitScript.php
│ ├── PaginatedListing/
│ │ └── Detail.php
│ ├── PaginatedListing.php
│ ├── PrintCookie.php
│ ├── PrintCookies.php
│ ├── PrintHeaders.php
│ ├── Publisher/
│ │ ├── AuthorDetailPage.php
│ │ ├── AuthorsListPage.php
│ │ ├── BookDetailPage.php
│ │ └── EditionDetailPage.php
│ ├── QueryParamPagination.php
│ ├── RssFeed.php
│ ├── ServiceUnavailable.php
│ ├── SetCookie.php
│ ├── SetCookieJs.php
│ ├── SetDelayedCookieJs.php
│ ├── SetMultipleCookiesJs.php
│ ├── SimpleListing/
│ │ └── Detail.php
│ ├── SimpleListing.php
│ └── TooManyRequests.php
├── _Stubs/
│ ├── AbstractTestPaginator.php
│ ├── Crawlers/
│ │ ├── DummyOne.php
│ │ ├── DummyTwo/
│ │ │ ├── DummyTwoLoader.php
│ │ │ ├── DummyTwoLogger.php
│ │ │ └── DummyTwoUserAgent.php
│ │ └── DummyTwo.php
│ ├── DummyLogger.php
│ ├── PhantasyLoader.php
│ └── RespondedRequestChild.php
└── _Temp/
├── _cachedir/
│ └── .gitkeep
└── _storagedir/
└── .gitkeep
================================================
FILE CONTENTS
================================================
================================================
FILE: .editorconfig
================================================
# EditorConfig is awesome: http://EditorConfig.org
root = true
[*]
charset = utf-8
end_of_line = lf
indent_style = space
indent_size = 4
insert_final_newline = true
trim_trailing_whitespace = true
[*.md]
trim_trailing_whitespace = false
[*.yml]
indent_size = 2
[_cachefilecontent]
insert_final_newline = false
================================================
FILE: .gitattributes
================================================
.github export-ignore
bin/add-git-hooks export-ignore
git-hooks export-ignore
tests export-ignore
.editorconfig export-ignore
.gitattributes export-ignore
.gitignore export-ignore
.php-cs-fixer.php export-ignore
phpstan.neon export-ignore
phpunit.xml export-ignore
================================================
FILE: .github/workflows/ci.yml
================================================
# Continuous integration workflow: runs the test suites, static analysis and
# the code style check for every pull request.
name: CI
on: pull_request
jobs:
# Job 1: unit and integration tests on every PHP version in the matrix.
tests:
name: PestPHP Tests
runs-on: ubuntu-latest
strategy:
matrix:
php-versions: ['8.1', '8.2', '8.3', '8.4', '8.5']
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install PHP
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php-versions }}
- name: Install dependencies
run: composer install --prefer-dist --no-progress
- name: Run tests
run: composer test
- name: Run integration tests
run: composer test-integration
# Job 2: the `test-php84` composer script, run only on PHP 8.4 and newer.
tests84:
name: PestPHP Tests Running only on PHP >= 8.4
runs-on: ubuntu-latest
strategy:
matrix:
php-versions: ['8.4', '8.5']
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install PHP
uses: shivammathur/setup-php@v2
with:
php-version: ${{ matrix.php-versions }}
- name: Install dependencies
run: composer install --prefer-dist --no-progress
- name: Run tests
run: composer test-php84
# Job 3: PHPStan and PHP CS Fixer, run once on PHP 8.1 with coverage disabled.
stanAndCs:
name: Static Analysis (phpstan) and Code Style (PHP CS Fixer)
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install PHP
uses: shivammathur/setup-php@v2
with:
php-version: '8.1'
coverage: none
- name: Install dependencies
run: composer install --prefer-dist --no-progress
- name: Run PHPStan
run: composer stan
- name: Run PHP CS Fixer
run: composer cs
================================================
FILE: .gitignore
================================================
composer.lock
vendor
.php_cs.cache
.php-cs-fixer.cache
.phpunit.result.cache
.phpunit.cache
/cachedir
/storedir
/tests/_Temp/_cachedir/*
!/tests/_Temp/_cachedir/.gitkeep
================================================
FILE: .php-cs-fixer.php
================================================
<?php
use PhpCsFixer\Config;
use PhpCsFixer\Finder;
use PhpCsFixer\Runner\Parallel\ParallelConfigFactory;
// Directories (relative to the project root) that the fixer must not touch.
$excludedDirectories = ['tests/_Integration/_Server', '.github', 'bin', 'git-hooks'];

// Look at everything below the project root, except the excluded directories.
$finder = Finder::create()
    ->exclude($excludedDirectories)
    ->in(__DIR__);

// PER-CS is the base rule set; the remaining entries refine it.
$rules = [
    '@PER-CS' => true,
    'strict_param' => true,
    'array_syntax' => ['syntax' => 'short'],
    'no_unused_imports' => true,
    'operator_linebreak' => ['only_booleans' => true, 'position' => 'end'],
];

$config = new Config();

// Risky rules are allowed explicitly (NOTE(review): presumably required for
// `strict_param` - confirm against the PHP CS Fixer rule docs).
return $config
    ->setRiskyAllowed(true)
    ->setUsingCache(true)
    ->setParallelConfig(ParallelConfigFactory::detect())
    ->setFinder($finder)
    ->setRules($rules);
================================================
FILE: CHANGELOG.md
================================================
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [3.5.6] - 2026-01-05
### Fixed
* Potential issues found with PHPStan 2 on level 8.
## [3.5.5] - 2025-08-05
### Fixed
* Removed the overriding `validateAndSanitizeInput()` method from the `Paginate` HTTP step to ensure features like `staticUrl()` and `useInputKeyAsUrl()` work correctly.
* The `Paginate` HTTP step now also supports receiving an array of URLs, initiating pagination separately for each one.
### Deprecated
* The `Crwlr\Crawler\Steps\Loading\Http\Paginate` class. It shall be removed and its behavior implemented in the `Http` class directly, in the next major version.
## [3.5.4] - 2025-07-28
### Fixed
* An issue in the `SimpleWebsitePaginator` when used with stop rules.
## [3.5.3] - 2025-06-10
### Fixed
* Issues with passing cookies from the cookie jar to the headless browser when using the `useBrowser()` method on `Http` steps, in cases where the loader wasn’t globally configured to use the browser for all requests.
## [3.5.2] - 2025-05-16
### Fixed
* The `Result::toArray()` method now converts all objects contained in the Result array (at any level of the array) to arrays. Also, if the only element in a result array has some autogenerated key containing "unnamed", but the value also is an associative array with string keys, the method only returns that child array.
## [3.5.1] - 2025-04-23
### Fixed
* An issue that occurred, when a step uses the `PreStepInvocationLogger`. As refiners also use the logger, a newer logger (replacing the `PreStepInvocationLogger`) is now also passed to all registered refiners of a step.
* Enable applying refiners to output properties with array value. E.g. if a step outputs an array of URLs (`['https://...', 'https://...']`), a `UrlRefiner` will be applied to all those URLs.
## [3.5.0] - 2025-04-10
### Added
* Dynamically building request URLs from extracted data: `Http` steps now have a new `staticUrl()` method, and you can also use variables within that static URL - as well as in request headers and the body - like `https://www.example.com/foo/[crwl:some_extracted_property]`. These variables will be replaced with the corresponding properties from input data (also works with kept data).
* New Refiners:
* `DateTimeRefiner::reformat('Y-m-d H:i:s')` to reformat a date time string to a different format. Tries to automatically recognize the input format. If this does not work, you can provide an input format to use as the second argument.
* `HtmlRefiner::remove('#foo')` to remove nodes matching the given selector from selected HTML.
* Steps that produce multiple outputs per input can now group them per input by calling the new `Step::oneOutputPerInput()` method.
## [3.4.5] - 2025-04-09
### Fixed
* When feeding an `Http` step with a string that is not a valid URL (e.g. `https://`), the exception when trying to parse it as a URL is caught, and an error logged.
## [3.4.4] - 2025-04-04
### Fixed
* Since XML parsing errors sometimes occur because of characters that aren't valid within XML documents, the library now catches XML parsing errors, tries to find and replace invalid characters (with transliterations or HTML entities) and retries parsing the document. This works best when you additionally install the `voku/portable-ascii` composer package.
## [3.4.3] - 2025-04-03
### Fixed
* When providing an empty base selector to an `Html` step (`Html::each('')`, `Html::first('')`, `Html::last('')`), it won't fail with an error, but instead log a warning, that it most likely doesn't make sense.
* The `Step::keep()` methods now also work when applied to child steps within a group step.
## [3.4.2] - 2025-03-08
### Fixed
* Issue when using `Http::get()->useBrowser()->postBrowserNavigateHook()`. Previously, in this case, when the loader was configured to use the HTTP client, the post browser navigate hook was actually not set, because of an issue with the order in which things happened internally.
## [3.4.1] - 2025-03-08
### Fixed
* Since, when using the Chrome browser for loading, we can only execute GET requests:
* The loader now automatically switches to the HTTP client for POST, PUT, PATCH, and DELETE requests and logs a warning.
* A warning is logged when attempting to use "Post Browser Navigate Hooks" with POST, PUT, PATCH, or DELETE requests.
* Consequently, the `useBrowser()` method, introduced in v3.4.0, is also limited to GET requests.
## [3.4.0] - 2025-03-06
### Added
* Two new methods to the base class of all `Http` steps:
* `skipCache()` – Allows using the cache while skipping it for a specific loading step.
* `useBrowser()` – Switches the loader to use a (headless) Chrome browser for loading calls in a specific step and then reverts the loader to its previous setting.
* Introduced the new `BrowserAction::screenshot()` post browser navigate hook. It accepts an instance of the new `ScreenshotConfig` class, allowing you to configure various options (see the methods of `ScreenshotConfig`). If successful, the screenshot file paths are included in the `RespondedRequest` output object of the `Http` step.
## [3.3.0] - 2025-03-02
### Added
* New `BrowserAction`s to use with the `postBrowserNavigateHook()` method:
* `BrowserAction::clickInsideShadowDom()`
* `BrowserAction::moveMouseToElement()`
* `BrowserAction::moveMouseToPosition()`
* `BrowserAction::scrollDown()`
* `BrowserAction::scrollUp()`
* `BrowserAction::typeText()`
* `BrowserAction::waitForReload()`
* A new method in `HeadlessBrowserLoaderHelper` to include the HTML content of shadow DOM elements in the returned HTML. Use it like this: `$crawler->getLoader()->browser()->includeShadowElementsInHtml()`.
### Changed
* The `BrowserAction::clickElement()` action now automatically waits for an element matching the selector to be rendered, before performing the click. This means you don't need to put a `BrowserAction::waitUntilDocumentContainsElement()` before it. It works the same in the new `BrowserAction::clickInsideShadowDom()` and `BrowserAction::moveMouseToElement()` actions.
### Deprecated
* `BrowserAction::clickElementAndWaitForReload()` and `BrowserAction::evaluateAndWaitForReload()`. As a replacement, please use `BrowserAction::clickElement()` or `BrowserAction::evaluate()` and `BrowserAction::waitForReload()` separately.
## [3.2.5] - 2025-02-26
### Fixed
* When a child step is nested in the `extract()` method of an `Html` or `Xml` step, and does not use `each()` as the base, the extracted value is an array with the keys defined in the `extract()` call, rather than an array of such arrays as it would be with `each()` as base.
## [3.2.4] - 2025-02-25
### Fixed
* Trying to load a relative reference URI (no scheme and host/authority, only path) via the `HttpLoader` now immediately logs (or throws when `loadOrFail()` is used) an error instead of trying to actually load it.
## [3.2.3] - 2025-01-28
### Fixed
* Fix deprecation warning triggered in the `DomQuery` class, when trying to get the value of an HTML/XML attribute that does not exist on the element.
## [3.2.2] - 2025-01-17
### Fixed
* Warnings about loader hooks being called multiple times, when using a `BotUserAgent` and therefore loading and respecting the robots.txt file, or when using the `Http::stopOnErrorResponse()` method.
## [3.2.1] - 2025-01-13
### Fixed
* Reuse previously opened page when using the (headless) Chrome browser, instead of opening a new page for each request.
## [3.2.0] - 2025-01-12
### Added
* `RespondedRequest::isServedFromCache()` to determine whether a response was served from cache or actually loaded.
## [3.1.5] - 2025-01-10
### Fixed
* Another improvement for getting XML source when using the browser, in cases where Chrome doesn't identify the response as an XML document (even though a Content-Type header is sent).
## [3.1.4] - 2025-01-10
### Fixed
* `HttpLoader::dontUseCookies()` now also works when using the Chrome browser. Cookies are cleared before every request.
## [3.1.3] - 2025-01-10
### Fixed
* Further improve getting the raw response body from non-HTML documents via Chrome browser.
## [3.1.2] - 2025-01-08
### Fixed
* When loading a non-HTML document (e.g., XML) via the Chrome browser, the library now retrieves the original source. Previously, it returned the outerHTML of the rendered document, which wrapped the content in an HTML structure.
## [3.1.1] - 2025-01-07
### Fixed
* When the `validateAndSanitize()` method of a step throws an `InvalidArgumentException`, the exception is now caught, logged and the step is not invoked with the invalid input. This improves fault tolerance. Feeding a step with one invalid input shouldn't cause the whole crawler run to fail. Exceptions other than `InvalidArgumentException` remain uncaught.
## [3.1.0] - 2025-01-03
### Added
* New method `HeadlessBrowserLoaderHelper::setPageInitScript()` (`$crawler->getLoader()->browser()->setPageInitScript()`) to provide javascript code that is executed on every new browser page before navigating anywhere.
* New method `HeadlessBrowserLoaderHelper::useNativeUserAgent()` (`$crawler->getLoader()->browser()->useNativeUserAgent()`) to allow using the native `User-Agent` that your Chrome browser sends by default.
## [3.0.4] - 2024-12-18
### Fixed
* Minor improvement for the `DomQuery` (base for `Dom::cssSelector()` and `Dom::xPath()`): enable providing an empty string as selector, to simply get the node that the selector is applied to.
## [3.0.3] - 2024-12-11
### Fixed
* Improved fix for non UTF-8 characters in HTML documents declared as UTF-8.
## [3.0.2] - 2024-12-11
### Fixed
* When the new PHP 8.4 DOM API is used, and HTML declared as UTF-8 contains non UTF-8 compatible characters, it does not replace them with a � character, but instead removes them. This behaviour is consistent with the data returned by Symfony DomCrawler.
## [3.0.1] - 2024-12-10
### Undeprecated
* Removed deprecations for all XPath functionality (`Dom::xPath()`, `XPathQuery` class and `Node::queryXPath()`), because it's still available with the new DOM API in PHP 8.4.
## [3.0.0] - 2024-12-08
The primary change in version 3.0.0 is that the library now leverages PHP 8.4’s new DOM API when used in an environment with PHP >= 8.4. To maintain compatibility with PHP < 8.4, an abstraction layer has been implemented. This layer dynamically uses either the Symfony DomCrawler component or the new DOM API, depending on the PHP version.
Since no direct interaction with an instance of the Symfony DomCrawler library was required at the step level provided by the library, it is highly likely that you won’t need to make any changes to your code to upgrade to v3. To ensure a smooth transition, please review the points under “Changed.”
### Changed
* __BREAKING__: The `DomQuery::innerText()` method (a.k.a. `Dom::cssSelector('...')->innerText()`) has been removed. `innerText` exists only in the Symfony DomCrawler component, and its usefulness is questionable. If you still require this variant of the DOM element text, please let us know or create a pull request yourself. Thank you!
* __BREAKING__: The `DomQueryInterface` was removed. As the `DomQuery` class offers a lot more functionality than the interface defines, the purpose of the interface was questionable. Please use the abstract `DomQuery` class instead. This also means that some method signatures, type hinting the interface, have changed. Look for occurrences of `DomQueryInterface` and replace them.
* __BREAKING__: The visibility of the `DomQuery::filter()` method was changed from public to protected. It is still needed in the `DomQuery` class, but outside of it, it is probably better and easier to directly use the new DOM abstraction (see the `src/Steps/Dom` directory). If you are extending the `DomQuery` class (which is not recommended), be aware that the argument now takes a `Node` (from the new DOM abstraction) instead of a Symfony `Crawler`.
* __BREAKING__: The `Step::validateAndSanitizeToDomCrawlerInstance()` method was removed. Please use the `Step::validateAndSanitizeToHtmlDocumentInstance()` and `Step::validateAndSanitizeToXmlDocumentInstance()` methods instead.
* __BREAKING__: The second argument in `Closure`s passed to the `Http::crawl()->customFilter()` has changed from an instance of Symfony `Crawler` class, to an `HtmlElement` instance from the new DOM abstraction (`Crwlr\Crawler\Steps\Dom\HtmlElement`).
* __BREAKING__: The Filter class was split into `AbstractFilter` (base class for actual filter classes) and `Filter` only hosting the static function for easy instantiation, because otherwise each filter class also has all the static methods.
* __BREAKING__: Further, the signatures of some methods that are mainly here for internal usage, have changed due to the new DOM abstraction:
* The static `GetLink::isSpecialNonHttpLink()` method now needs an instance of `HtmlElement` instead of a Symfony `Crawler`.
* `GetUrlsFromSitemap::fixUrlSetTag()` now takes an `XmlDocument` instead of a Symfony `Crawler`.
* The `DomQuery::apply()` method now takes a `Node` instead of a Symfony `Crawler`.
### Deprecated
* `Dom::xPath()` method and
* the `XPathQuery` class as well as
* the new `Node::queryXPath()` method.
### Added
* New step output filter `Filter::arrayHasElement()`. When a step produces array output with a property being a numeric array, you can now filter outputs by checking if one element of that array property, matches certain filter criteria. Example: The outputs look like `['foo' => 'bar', 'baz' => ['one', 'two', 'three']]`. You can filter all outputs where `baz` contains `two` like: `Filter::arrayHasElement()->where('baz', Filter::equal('two'))`.
## [2.1.3] - 2024-11-05
### Fixed
* Improvements for deprecations in PHP 8.4.
## [2.1.2] - 2024-10-22
### Fixed
* Issue when converting cookie objects received from the chrome-php library.
## [2.1.1] - 2024-10-21
### Fixed
* Also add cookies, set during headless browser usage, to the cookie jar. When switching back to the (guzzle) HTTP client the cookies should also be sent.
* Don't call `Loader::afterLoad()` when `Loader::beforeLoad()` was not called before. This can potentially happen, when an exception is thrown before the call to the `beforeLoad` hook, but it is caught and the `afterLoad` hook method is called anyway. As this most likely won't make sense to users, the `afterLoad` hook callback functions will just not be called in this case.
* The `Throttler` class now has protected methods `_internalTrackStartFor()`, `_requestToUrlWasStarted()` and `_internalTrackEndFor()`. When extending the `Throttler` class (be careful, actually that's not really recommended) they can be used to check if a request to a URL was actually started before.
## [2.1.0] - 2024-10-19
### Added
* The new `postBrowserNavigateHook()` method in the `Http` step classes, which allows defining callback functions that are triggered after the headless browser navigated to the specified URL. They are called with the chrome-php `Page` object as argument, so you can interact with the page. Also, there is a new class `BrowserAction` providing some simple actions (like wait for element, click element,...) as Closures via static methods. You can use it like `Http::get()->postBrowserNavigateHook(BrowserAction::clickElement('#element'))`.
## [2.0.1] - 2024-10-15
### Fixed
* Issue with the `afterLoad` hook of the `HttpLoader`, introduced in v2. Calling the hook was commented out, which slipped through because the test case was faulty.
## [2.0.0] - 2024-10-15
### Changed
* __BREAKING__: Removed methods `BaseStep::addToResult()`, `BaseStep::addLaterToResult()`, `BaseStep::addsToOrCreatesResult()`, `BaseStep::createsResult()`, and `BaseStep::keepInputData()`. These methods were deprecated in v1.8.0 and should be replaced with `Step::keep()`, `Step::keepAs()`, `Step::keepFromInput()`, and `Step::keepInputAs()`.
* __BREAKING__: Added the following keep methods to the `StepInterface`: `StepInterface::keep()`, `StepInterface::keepAs()`, `StepInterface::keepFromInput()`, `StepInterface::keepInputAs()`, as well as `StepInterface::keepsAnything()`, `StepInterface::keepsAnythingFromInputData()` and `StepInterface::keepsAnythingFromOutputData()`. If you have a class that implements this interface without extending `Step` (or `BaseStep`), you will need to implement these methods yourself. However, it is strongly recommended to extend `Step` instead.
* __BREAKING__: With the removal of the `addToResult()` method, the library no longer uses `toArrayForAddToResult()` methods on output objects. Instead, please use `toArrayForResult()`. Consequently, `RespondedRequest::toArrayForAddToResult()` has been renamed to `RespondedRequest::toArrayForResult()`.
* __BREAKING__: Removed the `result` and `addLaterToResult` properties from `Io` objects (`Input` and `Output`). These properties were part of the `addToResult` feature and are now removed. Instead, use the `keep` property where kept data is added.
* __BREAKING__: The signature of the `Crawler::addStep()` method has changed. You can no longer provide a result key as the first parameter. Previously, this key was passed to the `Step::addToResult()` method internally. Now, please handle this call yourself.
* __BREAKING__: The return type of the `Crawler::loader()` method no longer allows `array`. This means it's no longer possible to provide multiple loaders from the crawler. Instead, use the new functionality to directly provide a custom loader to a step described below. As part of this change, the `UnknownLoaderKeyException` was also removed as it is now obsolete. If you have any references to this class, please make sure to remove them.
* __BREAKING__: Refactored the abstract `LoadingStep` class to a trait and removed the `LoadingStepInterface`. Loading steps should now extend the `Step` class and use the trait. As multiple loaders are no longer supported, the `addLoader` method was renamed to `setLoader`. Similarly, the methods `useLoader()` and `usesLoader()` for selecting loaders by key are removed. Now, you can directly provide a different loader to a single step using the trait's new `withLoader()` method (e.g., `Http::get()->withLoader($loader)`). The trait now also uses phpdoc template tags, for a generic loader type. You can define the loader type by putting `/** @use LoadingStep<MyLoader> */` above `use LoadingStep;` in your step class. Then your IDE and static analysis (if supported) will know what type of loader the trait methods return and accept.
* __BREAKING__: Removed the `PaginatorInterface` to allow for better extensibility. The old `Crwlr\Crawler\Steps\Loading\Http\Paginators\AbstractPaginator` class has also been removed. Please use the newer, improved version `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator`. This newer version has also changed: the first argument `UriInterface $url` is removed from the `processLoaded()` method, as the URL also is part of the request (`Psr\Http\Message\RequestInterface`) which is now the first argument. Additionally, the default implementation of the `getNextRequest()` method is removed. Child implementations must define this method themselves. If your custom paginator still has a `getNextUrl()` method, note that it is no longer needed by the library and will not be called. The `getNextRequest()` method now fulfills its original purpose.
* __BREAKING__: Removed methods from `HttpLoader`:
* `$loader->setHeadlessBrowserOptions()` => use `$loader->browser()->setOptions()` instead
* `$loader->addHeadlessBrowserOptions()` => use `$loader->browser()->addOptions()` instead
* `$loader->setChromeExecutable()` => use `$loader->browser()->setExecutable()` instead
* `$loader->browserHelper()` => use `$loader->browser()` instead
* __BREAKING__: Removed method `RespondedRequest::cacheKeyFromRequest()`. Use `RequestKey::from()` instead.
* __BREAKING__: The `HttpLoader::retryCachedErrorResponses()` method now returns an instance of the new `Crwlr\Crawler\Loader\Http\Cache\RetryManager` class. This class provides the methods `only()` and `except()` to restrict retries to specific HTTP response status codes. Previously, this method returned the `HttpLoader` itself (`$this`), so if you're using it in a chain and calling other loader methods after it, you will need to refactor your code.
* __BREAKING__: Removed the `Microseconds` class from this package. It has been moved to the `crwlr/utils` package, which you can use instead.
### Added
* New methods `FileCache::prolong()` and `FileCache::prolongAll()` to allow prolonging the time to live for cached responses.
### Fixed
* The `maxOutputs()` method is now also available and working on `Group` steps.
* Improved warning messages for step validations that are happening before running a crawler.
* A `PreRunValidationException` when the crawler finds a problem with the setup, before actually running, is not only logged as an error via the logger, but also rethrown to the user. This way the user won't get the impression, that the crawler ran successfully without looking at the log messages.
## [1.10.0] - 2024-08-05
### Added
* URL refiners: `UrlRefiner::withScheme()`, `UrlRefiner::withHost()`, `UrlRefiner::withPort()`, `UrlRefiner::withoutPort()`, `UrlRefiner::withPath()`, `UrlRefiner::withQuery()`, `UrlRefiner::withoutQuery()`, `UrlRefiner::withFragment()` and `UrlRefiner::withoutFragment()`.
* New paginator stop rules `PaginatorStopRules::contains()` and `PaginatorStopRules::notContains()`.
* Static method `UserAgent::mozilla5CompatibleBrowser()` to get a `UserAgent` instance with the user agent string `Mozilla/5.0 (compatible)` and also the new method `withMozilla5CompatibleUserAgent` in the `AnonymousHttpCrawlerBuilder` that you can use like this: `HttpCrawler::make()->withMozilla5CompatibleUserAgent()`.
## [1.9.5] - 2024-07-25
### Fixed
* Prevent PHP warnings when an HTTP response includes a `Content-Type: application/x-gzip` header, but the content is not actually compressed. This issue also occurred with cached responses, because compressed content is decoded during caching. Upon retrieval from the cache, the header indicated compression, but the content was already decoded.
## [1.9.4] - 2024-07-24
### Fixed
* When using `HttpLoader::cacheOnlyWhereUrl()` to restrict caching, the filter rule is not only applied when adding newly loaded responses to the cache, but also for using cached responses. Example: a response for `https://www.example.com/foo` is already available in the cache, but `$loader->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'))` was called, the cached response is not used.
## [1.9.3] - 2024-07-05
### Fixed
* Add `HttpLoader::browser()` as a replacement for `HttpLoader::browserHelper()` and deprecate the `browserHelper()` method. It's an alias and just because it will read a little better: `$loader->browser()->xyz()` vs. `$loader->browserHelper()->xyz()`. `HttpLoader::browserHelper()` will be removed in v2.0.
* Also deprecate `HttpLoader::setHeadlessBrowserOptions()`, `HttpLoader::addHeadlessBrowserOptions()` and `HttpLoader::setChromeExecutable()`. Use `$loader->browser()->setOptions()`, `$loader->browser()->addOptions()` and `$loader->browser()->setExecutable()` instead.
## [1.9.2] - 2024-06-18
### Fixed
* Issue with setting the headless chrome executable, introduced in 1.9.0.
## [1.9.1] - 2024-06-17
### Added
* Also add `HeadlessBrowserLoaderHelper::getTimeout()` to get the currently configured timeout value.
## [1.9.0] - 2024-06-17
### Added
* New methods `HeadlessBrowserLoaderHelper::setTimeout()` and `HeadlessBrowserLoaderHelper::waitForNavigationEvent()` to allow defining the timeout for the headless chrome in milliseconds (default 30000 = 30 seconds) and the navigation event (`load` (default), `DOMContentLoaded`, `firstMeaningfulPaint`, `networkIdle`, etc.) to wait for when loading a URL.
## [1.8.0] - 2024-06-05
### Added
* New methods `Step::keep()` and `Step::keepAs()`, as well as `Step::keepFromInput()` and `Step::keepInputAs()`, as alternatives to `Step::addToResult()` (or `Step::addLaterToResult()`). The `keep()` method can be called without any argument, to keep all from the output data. It can be called with a string, to keep a certain key or with an array to keep a list of keys. If the step yields scalar value outputs (not an associative array or object with keys) you need to use the `keepAs()` method with the key you want the output value to have in the kept data. The methods `keepFromInput()` and `keepInputAs()` work the same, but use the input (not the output) that the step receives. Most likely only needed with a first step, to keep data from initial inputs (or in a sub crawler, see below). Kept properties can also be accessed with the `Step::useInputKey()` method, so you can easily reuse properties from multiple steps ago as input.
* New method `Step::outputType()` with default implementation returning `StepOutputType::Mixed`. Please consider implementing this method yourself in all your custom steps, because it is going to be required in v2 of the library. It allows detecting (potential) problems in crawling procedures immediately when starting a run instead of failing after already running a while.
* New method `Step::subCrawlerFor()`, allowing to fill output properties from an actual full child crawling procedure. As the first argument, you give it a key from the step's output, that the child crawler uses as input(s). As the second argument you need to provide a `Closure` that receives a clone of the current `Crawler` without steps and with initial inputs, set from the current output. In the `Closure` you then define the crawling procedure by adding steps as you're used to do it, and return it. This allows to achieve nested output data, scraped from different (sub-)pages, in a more flexible and less complicated way than with the usual linear crawling procedure and `Step::addToResult()`.
### Deprecated
* The `Step::addToResult()`, `Step::addLaterToResult()` and `Step::keepInputData()` methods. Instead, please use the new keep methods. This can cause some migration work for v2, because especially the add to result methods are a pretty central functionality, but the new "keep" methodology (plus the new sub crawler feature) will make a lot of things easier, less complex and the library will most likely work more efficiently in v2.
### Fixed
* When a cache file was generated with compression, and you're trying to read it with a `FileCache` instance without compression enabled, it also works. When unserializing the file content fails it tries decoding the string first before unserializing it.
## [1.7.2] - 2024-03-19
### Fixed
* When the `useInputKey()` method is used on a step and the defined key does not exist in input, it logs a warning and does not invoke the step instead of throwing an `Exception`.
## [1.7.1] - 2024-03-11
### Fixed
* A PHP error that happened when the loader returns `null` for the initial request in the `Http::crawl()` step.
## [1.7.0] - 2024-03-04
### Added
* Allow getting the whole decoded JSON as array with the new `Json::all()` and also allow to get the whole decoded JSON, when using `Json::get()`, inside a mapping using either empty string or `*` as target. Example: `Json::get(['all' => '*'])`. `*` only works, when there is no key `*` in the decoded data.
### Fixed
* Make it work with responses loaded by a headless browser. If decoding the input string fails, it now checks if it could be HTML. If that's the case, it extracts the text content of the `<body>` and tries to decode this instead.
## [1.6.2] - 2024-02-26
### Fixed
* When using `HttpLoader::cacheOnlyWhereUrl()` and a request was redirected (maybe even multiple times), previously all URLs in the chain had to match the filter rule. As this isn't really practicable, now only one of the URLs has to match the rule.
## [1.6.1] - 2024-02-16
### Changed
* Make method `HttpLoader::addToCache()` public, so steps can update a cached response with an extended version.
## [1.6.0] - 2024-02-13
### Added
* Enable dot notation in `Step::addToResult()`, so you can get data from nested output, like: `$step->addToResult(['url' => 'response.url', 'status' => 'response.status', 'foo' => 'bar'])`.
* When a step adds output properties to the result, and the output contains objects, it tries to serialize those objects to arrays, by calling `__serialize()`. If you want an object to be serialized differently for that purpose, you can define a `toArrayForAddToResult()` method in that class. When that method exists, it's preferred to the `__serialize()` method.
* Implemented above-mentioned `toArrayForAddToResult()` method in the `RespondedRequest` class, so on every step that somehow yields a `RespondedRequest` object, you can use the keys `url`, `uri`, `status`, `headers` and `body` with the `addToResult()` method. Previously this only worked for `Http` steps, because it defines output key aliases (`HttpBase::outputKeyAliases()`). Now, in combination with the ability to use dot notation when adding data to the result, if your custom step returns nested output like `['response' => RespondedRequest, 'foo' => 'bar']`, you can add response data to the result like this `$step->addToResult(['url' => 'response.url', 'body' => 'response.body'])`.
### Fixed
* Improvement regarding the timing when a store (`Store` class instance) is called by the crawler with a final crawling result. When a crawling step initiates a crawling result (so, `addToResult()` was called on the step instance), the crawler has to wait for all child outputs (resulting from one step-input) until it calls the store, because the child outputs can all add data to the same final result object. But previously this was not only the case for all child outputs starting from a step where `addToResult()` was called, but all children of one initial crawler input. So with this change, in a lot of cases, the store will earlier be called with finished `Result` objects and memory usage will be lowered.
## [1.5.3] - 2024-02-07
### Fixed
* Merge `HttpBaseLoader` back to `HttpLoader`. It's probably not a good idea to have multiple loaders. At least not multiple loaders just for HTTP. It should be enough to publicly expose the `HeadlessBrowserLoaderHelper` via `HttpLoader::browserHelper()` for the extension steps. But keep the `HttpBase` step, to share the general HTTP functionality implemented there.
## [1.5.2] - 2024-02-07
### Fixed
* Issue in `GetUrlsFromSitemap` (`Sitemap::getUrlsFromSitemap()`) step when XML content has no line breaks.
## [1.5.1] - 2024-02-06
### Fixed
* For being more flexible to build a separate headless browser loader (in an extension package) extract the most basic HTTP loader functionality to a new `HttpBaseLoader` and important functionality for the headless browser loader to a new `HeadlessBrowserLoaderHelper`. Further, also share functionality from the `Http` steps via a new abstract `HttpBase` step. It's considered a fix, because there's no new functionality, just refactoring existing code for better extendability.
## [1.5.0] - 2024-01-29
### Added
* The `DomQuery` class (parent of `CssSelector` (`Dom::cssSelector`) and `XPathQuery` (`Dom::xPath`)) has a new method `formattedText()` that uses the new crwlr/html-2-text package to convert the HTML to formatted plain text. You can also provide a customized instance of the `Html2Text` class to the `formattedText()` method.
### Fixed
* The `Http::crawl()` step won't yield a page again if a newly found URL responds with a redirect to a previously loaded URL.
## [1.4.0] - 2024-01-14
### Added
* The `QueryParamsPaginator` can now also increase and decrease non first level query param values like `foo[bar][baz]=5` using dot notation: `QueryParamsPaginator::paramsInUrl()->increaseUsingDotNotation('foo.bar.baz', 5)`.
## [1.3.5] - 2023-12-20
### Fixed
* The `FileCache` can now also read uncompressed cache files when compression is activated.
## [1.3.4] - 2023-12-19
### Fixed
* Reset paginator state after finishing paginating for one base input, to enable paginating multiple listings of the same structure.
## [1.3.3] - 2023-12-01
### Fixed
* Add forgotten getter method to get the DOM query that is attached to an `InvalidDomQueryException` instance.
## [1.3.2] - 2023-12-01
### Fixed
* When creating a `CssSelector` or `XPathQuery` instance with invalid selector/query syntax, an `InvalidDomQueryException` is now immediately thrown. This change is considered to be not only non-breaking, but actually a fix, because the `CssSelector` would otherwise throw an exception later when the `apply()` method is called. The `XPathQuery` would silently return no result without notifying you of the invalid query and generate a PHP warning.
## [1.3.1] - 2023-11-30
### Fixed
* Support usage with the new Symfony major version v7.
## [1.3.0] - 2023-10-28
### Added
* New methods `HttpLoader::useProxy()` and `HttpLoader::useRotatingProxies([...])` to define proxies that the loader shall use. They can be used with a guzzle HTTP client instance (default) and when the loader uses the headless Chrome browser. Using them when providing some other PSR-18 implementation will throw an exception.
* New `QueryParamsPaginator` to paginate by increasing and/or decreasing one or multiple query params, either in the URL or in the body of requests. Can be created via static method `Crwlr\Crawler\Steps\Loading\Http\Paginator::queryParams()`.
* New method `stopWhen` in the new `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator` class (for more info see the deprecation below). You can pass implementations of the new `StopRule` interface or custom closures to that method and then, every time the Paginator receives a loaded response to process, those stop rules are called with the response. If any of the conditions of the stop rules is met, the Paginator stops paginating. Of course also added a few stop rules to use with that new method: `IsEmptyInHtml`, `IsEmptyInJson`, `IsEmptyInXml` and `IsEmptyResponse`, also available via static methods: `PaginatorStopRules::isEmptyInHtml()`, `PaginatorStopRules::isEmptyInJson()`, `PaginatorStopRules::isEmptyInXml()` and `PaginatorStopRules::isEmptyResponse()`.
### Deprecated
* Deprecated the `Crwlr\Crawler\Steps\Loading\Http\PaginatorInterface` and the `Crwlr\Crawler\Steps\Loading\Http\Paginators\AbstractPaginator`. Instead, added a new version of the `AbstractPaginator` as `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator` that can be used. Usually there shouldn't be a problem switching from the old to the new version. If you want to make your custom paginator implementation ready for v2 of the library, extend the new `AbstractPaginator` class, implement your own `getNextRequest` method (new requirement, with a default implementation in the abstract class, which will be removed in v2) and check if properties and methods of your existing class don't collide with the new properties and methods in the abstract class.
### Fixed
* The `HttpLoader::load()` implementation won't throw any exception, because it shouldn't kill a crawler run. When you want any loading error to end the whole crawler execution `HttpLoader::loadOrFail()` should be used. Also adapted the phpdoc in the `LoaderInterface`.
## [1.2.2] - 2023-09-19
### Fixed
* Fix in `HttpCrawl` (`Http::crawl()`) step: when a page contains a broken link, that can't be resolved and throws an `Exception` from the URL library, ignore the link and log a warning message.
* Minor fix for merging HTTP headers when an `Http` step gets both, statically defined headers and headers to use from array input.
## [1.2.1] - 2023-08-21
### Fixed
* When a URL redirects, the `trackRequestEndFor()` method of the `HttpLoader`'s `Throttler` instance is called only once at the end and with the original request URL.
## [1.2.0] - 2023-08-18
### Added
* New `onCacheHit` hook in the `Loader` class (in addition to `beforeLoad`, `onSuccess`, `onError` and `afterLoad`) that is called in the `HttpLoader` class when a response for a request was found in the cache.
### Deprecated
* Moved the `Microseconds` value object class to the crwlr/utils package, as it is a very useful and universal tool. The class in this package still exists, but just extends the class from the utils package and will be removed in v2. So, if you're using this class, please change to use the version from the utils package.
## [1.1.6] - 2023-07-20
### Fixed
* Throttling now also works when using the headless browser.
## [1.1.5] - 2023-07-14
### Fixed
* The `Http::crawl()` step, as well as the `Html::getLink()` and `Html::getLinks()` steps now ignore links, when the `href` attribute starts with `mailto:`, `tel:` or `javascript:`. For the crawl step it obviously makes no sense, but it's also considered a bugfix for the getLink(s) steps, because they are meant to deliver absolute HTTP URLs. If you want to get the values of such links, use the HTML data extraction step.
## [1.1.4] - 2023-07-14
### Fixed
* The `Http::crawl()` step now also works with sitemaps as input URL, where the `<urlset>` tag contains attributes that would cause the symfony DomCrawler to not find any elements.
## [1.1.3] - 2023-06-29
### Fixed
* Improved `Json` step: if the target of the "each" (like `Json::each('target', [...])`) does not exist in the input JSON data, the step yields nothing and logs a warning.
## [1.1.2] - 2023-05-28
### Fixed
* Using the `only()` method of the `MetaData` (`Html::metaData()`) step class, the `title` property was always contained in the output, even if not listed in the `only` properties. This is fixed now.
## [1.1.1] - 2023-05-28
### Fixed
* There was an issue when adding multiple associative arrays with the same key to a `Result` object: let's say you're having a step producing array output like: `['bar' => 'something', 'baz' => 'something else']` and it (the whole array) shall be added to the result property `foo`. When the step produced multiple such array outputs, that led to a result like `['bar' => '...', 'baz' => '...', ['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...']]`. Now it's fixed to result in `[['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...']]`.
## [1.1.0] - 2023-05-21
### Added
* `Http` steps can now receive body and headers from input data (instead of statically defining them via argument like `Http::method(headers: ...)`) using the new methods `useInputKeyAsBody(<key>)` and `useInputKeyAsHeader(<key>, <asHeader>)` or `useInputKeyAsHeaders(<key>)`. Further, when invoked with associative array input data, the step will by default use the value from `url` or `uri` for the request URL. If the input array contains the URL in a key with a different name, you can use the new `useInputKeyAsUrl(<key>)` method. That was basically already possible with the existing `useInputKey(<key>)` method, because the URL is the main input argument for the step. But if you want to use it in combination with the other new `useInputKeyAsXyz()` methods, you have to use `useInputKeyAsUrl()`, because using `useInputKey(<key>)` would invoke the whole step with that key only.
* `Crawler::runAndDump()` as a simple way to just run a crawler and dump all results, each as an array.
* `addToResult()` now also works with serializable objects.
* If you know certain keys that the output of a step will contain, you can now also define aliases for those keys, to be used with `addToResult()`. The output of an `Http` step (`RespondedRequest`) contains the keys `requestUri` and `effectiveUri`. The aliases `url` and `uri` refer to `effectiveUri`, so `addToResult(['url'])` will add the `effectiveUri` as `url` to the result object.
* The `GetLink` (`Html::getLink()`) and `GetLinks` (`Html::getLinks()`) steps, as well as the abstract `DomQuery` (parent of `CssSelector` (/`Dom::cssSelector`) and `XPathQuery` (/`Dom::xPath`)) now have a method `withoutFragment()` to get links respectively URLs without their fragment part.
* The `HttpCrawl` step (`Http::crawl()`) has a new method `useCanonicalLinks()`. If you call it, the step will not yield responses if its canonical link URL was already yielded. And if it discovers a link, and some document pointing to that URL via canonical link was already loaded, it treats it as if it was already loaded. Further this feature also sets the canonical link URL as the `effectiveUri` of the response.
* All filters can now be negated by calling the `negate()` method, so the `evaluate()` method will return the opposite bool value when called. The `negate()` method returns an instance of `NegatedFilter` that wraps the original filter.
* New method `cacheOnlyWhereUrl()` in the `HttpLoader` class, that takes an instance of the `FilterInterface` as argument. If you define one or multiple filters using this method, the loader will cache only responses for URLs that match all the filters.
### Fixed
* The `HttpCrawl` step (`Http::crawl()`) by default now removes the fragment part of URLs to not load the same page multiple times, because in almost any case, servers won't respond with different content based on the fragment. That's why this change is considered non-breaking. For the rare cases when servers respond with different content based on the fragment, you can call the new `keepUrlFragment()` method of the step.
* Although the `HttpCrawl` step (`Http::crawl()`) already respected the limit of outputs defined via the `maxOutputs()` method, it actually didn't stop loading pages. The limit had no effect on loading, only on passing on outputs (responses) to the next step. This is fixed in this version.
* A so-called byte order mark at the beginning of a file (/string) can cause issues. So just remove it, when a step's input string starts with a UTF-8 BOM.
* There seems to be an issue in guzzle when it gets a PSR-7 request object with a header with multiple string values (as array, like: `['accept-encoding' => ['gzip', 'deflate', 'br']]`). When testing it happened that it only sent the last part (in this case `br`). Therefore, the `HttpLoader` now prepares headers before sending (in this case to: `['accept-encoding' => ['gzip, deflate, br']]`).
* You can now also use the output key aliases when filtering step outputs. You can even use keys that are only present in the serialized version of an output object.
## [1.0.2] - 2023-03-20
### Fixed
* JSON step: another fix for JSON strings having keys without quotes with empty string value.
## [1.0.1] - 2023-03-17
### Fixed
* JSON step: improve attempt to fix JSON string having keys without quotes.
## [1.0.0] - 2023-02-08
### Added
* New method `Step::refineOutput()` to manually refine step output values. It takes either a `Closure` or an instance of the new `RefinerInterface` as argument. If the step produces array output, you can provide a key from the array output, to refine, as first argument and the refiner as second argument. You can call the method multiple times and all the refiners will be applied to the outputs in the order you add them. If you want to refine multiple output array keys with a `Closure`, you can skip providing a key and the `Closure` will receive the full output array for refinement. As mentioned you can provide an instance of the `RefinerInterface`. There are already a few implementations: `StringRefiner::afterFirst()`, `StringRefiner::afterLast()`, `StringRefiner::beforeFirst()`, `StringRefiner::beforeLast()`, `StringRefiner::betweenFirst()`, `StringRefiner::betweenLast()` and `StringRefiner::replace()`.
* New method `Step::excludeFromGroupOutput()` to exclude a normal step's output from the combined output of a group that it's part of.
* New method `HttpLoader::setMaxRedirects()` to customize the limit of redirects to follow. Works only when using the HTTP client.
* New filters to filter by string length, with the same options as the comparison filters (equal, not equal, greater than,...).
* New `Filter::custom()` that you can use with a Closure, so you're not limited to the available filters only.
* New method `DomQuery::link()` as a shortcut for `DomQuery::attribute('href')->toAbsoluteUrl()`.
* New static method `HttpCrawler::make()` returning an instance of the new class `AnonymousHttpCrawlerBuilder`. This makes it possible to create your own Crawler instance with a one-liner like: `HttpCrawler::make()->withBotUserAgent('MyCrawler')`. There's also a `withUserAgent()` method to create an instance with a normal (non bot) user agent.
### Changed
* __BREAKING__: The `FileCache` now also respects the `ttl` (time to live) argument and by default it is one hour (3600 seconds). If you're using the cache and expect the items to live (basically) forever, please provide a high enough value for the default time to live. When you try to get a cache item that is already expired, it (the file) is immediately deleted.
* __BREAKING__: The `TooManyRequestsHandler` (and with that also the constructor argument in the `HttpLoader`) was renamed to `RetryErrorResponseHandler`. It now reacts the same to 503 (Service Unavailable) responses as to the 429 (Too Many Requests) responses. If you're actively passing your own instance to the `HttpLoader`, you need to update it.
* You can now have multiple different loaders in a `Crawler`. To use this, return an array containing your loaders from the protected `Crawler::loader()` method with keys to name them. You can then selectively use them by calling the `Step::useLoader()` method on a loading step with the key of the loader it should use.
### Removed
* __BREAKING__: The loop feature. The only real world use case should be paginating listings and this should be solved with the Paginator feature.
* __BREAKING__: `Step::dontCascade()` and `Step::cascades()` because with the change in v0.7, that groups can only produce combined output, there should be no use case for this anymore. If you want to exclude one step's output from the combined group output, you can use the new `Step::excludeFromGroupOutput()` method.
## [0.7.0] - 2023-01-13
### Added
* New functionality to paginate: There is the new `Paginate` child class of the `Http` step class (easy access via `Http::get()->paginate()`). It takes an instance of the `PaginatorInterface` and uses it to iterate through pagination links. There is one implementation of that interface, the `SimpleWebsitePaginator`. The `Http::get()->paginate()` method uses it by default, when called just with a CSS selector to get pagination links. Paginators receive all loaded pages and implement the logic to find pagination links. The paginator class is also called before sending a request, with the request object that is about to be sent as an argument (`prepareRequest()`). This way, it should even be doable to implement more complex pagination functionality. For example when pagination is built using POST request with query strings in the request body.
* New methods `stopOnErrorResponse()` and `yieldErrorResponses()` that can be used with `Http` steps. By calling `stopOnErrorResponse()` the step will throw a `LoadingException` when a response has a 4xx or 5xx status code. By calling `yieldErrorResponses()`, even error responses will be yielded and passed on to the next steps (this was default behaviour until this version. See the breaking change below).
* The body of HTTP responses with a `Content-Type` header containing `application/x-gzip` is automatically decoded when `Http::getBodyString()` is used. Therefore, added `ext-zlib` to `suggest` in `composer.json`.
* New methods `addToResult()` and `addLaterToResult()`. `addToResult()` is a single replacement for `setResultKey()` and `addKeysToResult()` (they are removed, see `Changed` below) that can be used for array and non array output. `addLaterToResult()` is a new method that does not create a Result object immediately, but instead adds the output of the current step to all the Results that will later be created originating from the current output.
* New methods `outputKey()` and `keepInputData()` that can be used with any step. Using the `outputKey()` method, the step will convert non array output to an array and use the key provided as an argument to this method as array key for the output value. The `keepInputData()` method allows you to forward data from the step's input to the output. If the input is non array you can define a key using the method's argument. This is useful e.g. if you're having data in the initial inputs that you also want to add to the final crawling results.
* New method `createsResult()` that can be used with any step, so you can differentiate if a step creates a Result object, or just keeps data to add to results later (new `addLaterToResult()` method). But primarily relevant for library internal use.
* The `FileCache` class can compress the cache data now to save disk space. Use the `useCompression()` method to do so.
* New method `retryCachedErrorResponses()` in `HttpLoader`. When called, the loader will only use successful responses (status code < 400) from the cache and therefore retry already cached error responses.
* New method `writeOnlyCache()` in `HttpLoader` to only write to, but not read from, the response cache. Can be used to renew cached responses.
* `Filter::urlPathMatches()` to filter URL paths using a regex.
* Option to provide a chrome executable name to the `chrome-php/chrome` library via `HttpLoader::setChromeExecutable()`.
### Changed
* __BREAKING__: Group steps can now only produce combined outputs, as previously done when `combineToSingleOutput()` method was called. The method is removed.
* __BREAKING__: `setResultKey()` and `addKeysToResult()` are removed. Calls to those methods can both be replaced with calls to the new `addToResult()` method.
* __BREAKING__: `getResultKey()` is also removed with `setResultKey()`. It's removed without replacement, as it doesn't really make sense any longer.
* __BREAKING__: Error responses (4xx as well as 5xx), by default, won't produce any step outputs any longer. If you want to receive error responses, use the new `yieldErrorResponses()` method.
* __BREAKING__: Removed the `httpClient()` method in the `HttpCrawler` class. If you want to provide your own HTTP client, implement a custom `loader` method passing your client to the `HttpLoader` instead.
* __Deprecated__ the loop feature (class `Loop` and `Crawler::loop()` method). Probably the only use case is iterating over paginated list pages, which can be done using the new Paginator functionality. It will be removed in v1.0.
* In case of a 429 (Too Many Requests) response, the `HttpLoader` now automatically waits and retries. By default, it retries twice and waits 10 seconds for the first retry and a minute for the second one. In case the response also contains a `Retry-After` header with a value in seconds, it complies with that. Exception: by default it waits at most `60` seconds (you can set your own limit if you want); if the `Retry-After` value is higher, it will stop crawling. If all the retries also receive a `429` it also throws an Exception.
* Removed logger from `Throttler` as it doesn't log anything.
* Fail silently when `robots.txt` can't be parsed.
* Default timeout configuration for the default guzzle HTTP client: `connect_timeout` is `10` seconds and `timeout` is `60` seconds.
* The `validateAndSanitize...()` methods in the abstract `Step` class, when called with an array with one single element, automatically try to use that array element as input value.
* With the `Html` and `Xml` data extraction steps you can now add layers to the data that is being extracted, by just adding further `Html`/`Xml` data extraction steps as values in the mapping array that you pass as argument to the `extract()` method.
* The base `Http` step can now also be called with an array of URLs as a single input. Crawl and Paginate steps still require a single URL input.
### Fixed
* The `CookieJar` now also works with `localhost` or other hosts without a registered domain name.
* Improve the `Sitemap::getUrlsFromSitemap()` step to also work when the `<urlset>` tag contains attributes that would cause the symfony DomCrawler to not find any elements.
* Fixed possibility of infinite redirects in `HttpLoader` by adding a redirects limit of 10.
## [0.6.0] - 2022-10-03
### Added
* New step `Http::crawl()` (class `HttpCrawl` extending the normal `Http` step class) for conventional crawling. It loads all pages of a website (same host or domain) by following links. There are also a lot of options like depth, filtering by paths, and so on.
* New steps `Sitemap::getSitemapsFromRobotsTxt()` (`GetSitemapsFromRobotsTxt`) and `Sitemap::getUrlsFromSitemap()` (`GetUrlsFromSitemap`) to get sitemap (URLs) from a robots.txt file and to get all the URLs from those sitemaps.
* New step `Html::metaData()` to get data from meta tags (and title tag) in HTML documents.
* New step `Html::schemaOrg()` (`SchemaOrg`) to get schema.org structured data in JSON-LD format from HTML documents.
* The abstract `DomQuery` class (parent of the `CssSelector` and `XPathQuery` classes) now has some methods to narrow the selected matches further: `first()`, `last()`, `nth(n)`, `even()`, `odd()`.
### Changed
* __BREAKING__: Removed `PoliteHttpLoader` and traits `WaitPolitely` and `CheckRobotsTxt`. Converted the traits to classes `Throttler` and `RobotsTxtHandler` which are dependencies of the `HttpLoader`. The `HttpLoader` internally gets default instances of those classes. The `RobotsTxtHandler` will respect robots.txt rules by default if you use a `BotUserAgent` and it won't if you use a normal `UserAgent`. You can access the loader's `RobotsTxtHandler` via `HttpLoader::robotsTxt()`. You can pass your own instance of the `Throttler` to the loader and also access it via `HttpLoader::throttle()` to change settings.
### Fixed
* Getting absolute links via the `GetLink` and `GetLinks` steps and the `toAbsoluteUrl()` method of the `CssSelector` and `XPathQuery` classes, now also look for `<base>` tags in HTML when resolving the URLs.
* The `SimpleCsvFileStore` can now also save results with nested data (but only second level). It just concatenates the values separated with a ` | `.
## [0.5.0] - 2022-09-03
### Added
* You can now call the new `useHeadlessBrowser` method on the `HttpLoader` class to use a headless Chrome browser to load pages. This is enough to get HTML after executing javascript in the browser. For more sophisticated tasks a separate Loader and/or Steps should better be created.
* With the `maxOutputs()` method of the abstract `Step` class you can now limit how many outputs a certain step should yield at max. That's for example helpful during development, when you want to run the crawler only with a small subset of the data/requests it will actually have to process when you eventually remove the limits. When a step has reached its limit, it won't even call the `invoke()` method any longer until the step is reset after a run.
* With the new `outputHook()` method of the abstract `Crawler` class you can set a closure that'll receive all the outputs from all the steps. Should be only for debugging reasons.
* The `extract()` method of the `Html` and `Xml` (children of `Dom`) steps now also works with a single selector instead of an array with a mapping. Sometimes you'll want to just get a simple string output e.g. for a next step, instead of an array with mapped extracted data.
* In addition to `uniqueOutputs()` there is now also `uniqueInputs()`. It works exactly the same as `uniqueOutputs()`, filtering duplicate input values instead. Optionally also by a key when expected input is an array or an object.
* In order to be able to also get absolute links when using the `extract()` method of Dom steps, the abstract `DomQuery` class now has a method `toAbsoluteUrl()`. The Dom step will automatically provide the `DomQuery` instance with the base url, presumed that the input was an instance of the `RespondedRequest` class and resolve the selected value against that base url.
### Changed
* Remove some not so important log messages.
* Improve behavior of group step's `combineToSingleOutput()`. When steps yield multiple outputs, don't combine all yielded outputs to one. Instead, combine the first output from the first step with the first output from the second step, and so on.
* When results are not explicitly composed, but the outputs of the last step are arrays with string keys, it sets those keys on the Result object instead of setting a key `unnamed` with the whole array as value.
### Fixed
* The static methods `Html::getLink()` and `Html::getLinks()` now also work without argument, like the `GetLink` and `GetLinks` classes.
* When a `DomQuery` (CSS selector or XPath query) doesn't match anything, its `apply()` method now returns `null` (instead of an empty string). When the `Html(/Xml)::extract()` method is used with a single, not matching selector/query, nothing is yielded. When it's used with an array with a mapping, it yields an array with null values. If the selector for one of the methods `Html(/Xml)::each()`, `Html(/Xml)::first()` or `Html(/Xml)::last()` doesn't match anything, that's not causing an error any longer, it just won't yield anything.
* Removed the (unnecessary) second argument from the `Loop::withInput()` method because when `keepLoopingWithoutOutput()` is called and `withInput()` is called after that call, it resets the behavior.
* Issue when date format for expires date in cookie doesn't have dashes in `d-M-Y` (so `d M Y`).
## [0.4.1] - 2022-05-10
### Fixed
* The `Json` step now also works with Http responses as input.
## [0.4.0] - 2022-05-06
### Added
* The `BaseStep` class now has `where()` and `orWhere()` methods to filter step outputs. You can set multiple filters that will be applied to all outputs. When setting a filter using `orWhere` it's linked to the previously added Filter with "OR". Outputs not matching one of the filters, are not yielded. The available filters can be accessed through static methods on the new `Filter` class. Currently available filters are comparison filters (equal, greater/less than,...), a few string filters (contains, starts/ends with) and url filters (scheme, domain, host,...).
* The `GetLink` and `GetLinks` steps now have methods `onSameDomain()`, `notOnSameDomain()`, `onDomain()`, `onSameHost()`, `notOnSameHost()`, `onHost()` to restrict which links to find.
* Automatically add the crawler's logger to the `Store` so you can also log messages from there. This can be breaking as the `StoreInterface` now also requires the `addLogger` method. The new abstract `Store` class already implements it, so you can just extend it.
### Changed
* The `Csv` step can now also be used without defining a column mapping. In that case it will use the values from the first line (so this makes sense when there are column headlines) as output array keys.
## [0.3.0] - 2022-04-27
### Added
* By calling `monitorMemoryUsage()` you can tell the Crawler to add log messages with the current memory usage after every step invocation. You can also set a limit in bytes when to start monitoring and below the limit it won't log memory usage.
### Fixed
* Previously the __use of Generators__ actually didn't make a lot of sense, because the outputs of one step were only iterated and passed on to the next step, after the current step was invoked with all its inputs. That makes steps with a lot of inputs bottlenecks and causes bigger memory consumption. So, changed the crawler to immediately pass on outputs of one step to the next step if there is one.
## [0.2.0] - 2022-04-25
### Added
* `uniqueOutputs()` method to Steps to get only unique output values. If outputs are array or object, you can provide a key that will be used as identifier to check for uniqueness. Otherwise, the arrays or objects will be serialized for comparison which will probably be slower.
* `runAndTraverse()` method to Crawler, so you don't need to manually traverse the Generator, if you don't need the results where you're calling the crawler.
* Implement the behaviour for when a `Group` step should add something to the Result using `setResultKey()` or `addKeysToResult()`, which was still missing. For groups this will only work when using `combineToSingleOutput`.
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to this Package
That you're reading this must mean you consider contributing to
this package. So first off: Awesome! 👍🤘
## Bugs
In case you encounter any bugs please
[file an issue](https://github.com/crwlrsoft/crawler/issues/new).
Describe the issue as well as you can and provide an example to
reproduce it.
Maybe you're not 100 percent sure whether what you've discovered
is a bug or the intended behavior. You can still file an issue
and tell us which results you'd expect.
If you know how to fix the issue you're welcome to send a pull
request. 💪
## New Features
If you have ideas for new features you can tell us about it on
[Twitter](https://twitter.com/crwlrsoft) or via
[crwlr.software](https://www.crwlr.software/contact) or just
send a pull request. Please keep in mind that there is no
guarantee that your feature will be merged.
## Conventions
### Coding Style
This package follows the
[PSR-12](https://www.php-fig.org/psr/psr-12/) coding standard.
You can run PHP CS Fixer via `composer cs` for a dry run or
`composer cs-fix` to automatically fix code style issues.
### Code quality tools
When you're making changes to this package please always run
tests and linting. Commands:
`composer test`
`composer test-integration`
`composer cs`
`composer stan`
Ideally you add the pre-commit git hook that is shipped with
this repo that will run tests and linting. Add it to your local
clone by running:
`composer add-git-hooks`
The integration tests start a simple PHP web server for the
testing purpose on port 8000. If you have anything else running
on that port, the integration tests won't work.
Also, please don't forget to add new test cases if necessary.
### Documentation
For any code change that changes/adds something for users of
the package, please don't forget to add an entry to the
`CHANGELOG.md` file.
## Appreciation
When your pull request is merged I will show some love and tweet
about it. Also, if you meet me in person I will be glad to buy you
a beer.
================================================
FILE: LICENSE
================================================
Copyright (c) 2026 Christian Olear
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject
to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: README.md
================================================
<p align="center"><a href="https://www.crwlr.software" target="_blank"><img src="https://github.com/crwlrsoft/graphics/blob/eee6cf48ee491b538d11b9acd7ee71fbcdbe3a09/crwlr-logo.png" alt="crwlr.software logo" width="260"></a></p>
# Library for Rapid (Web) Crawler and Scraper Development
This library provides kind of a framework and a lot of ready to use, so-called __steps__, that you can use as building blocks, to build your own crawlers and scrapers with.
To give you an overview, here's a list of things that it helps you with:
* [Crawler __Politeness__](https://www.crwlr.software/packages/crawler/the-crawler/politeness) 😇 (respecting robots.txt, throttling,...)
* Load URLs using
* [a __(PSR-18) HTTP client__](https://www.crwlr.software/packages/crawler/the-crawler/loaders) (default is of course Guzzle)
* or a [__headless browser__](https://www.crwlr.software/packages/crawler/the-crawler/loaders#using-a-headless-browser) (chrome) to get source after Javascript execution
* [Get __absolute links__ from HTML documents](https://www.crwlr.software/packages/crawler/included-steps/html#html-get-link) 🔗
* [Get __sitemaps__ from robots.txt and get all URLs from those sitemaps](https://www.crwlr.software/packages/crawler/included-steps/sitemap)
* [__Crawl__ (load) all pages of a website](https://www.crwlr.software/packages/crawler/included-steps/http#crawling) 🕷
* [Use __cookies__ (or don't)](https://www.crwlr.software/packages/crawler/the-crawler/loaders#http-loader) 🍪
* [Use any __HTTP methods__ (GET, POST,...) and send any headers or body](https://www.crwlr.software/packages/crawler/included-steps/http#http-requests)
* [Easily iterate over __paginated__ list pages](https://www.crwlr.software/packages/crawler/included-steps/http#paginating) 🔁
* Extract data from:
* [__HTML__](https://www.crwlr.software/packages/crawler/included-steps/html#extracting-data) and also [__XML__](https://www.crwlr.software/packages/crawler/included-steps/xml) (using CSS selectors or XPath queries)
* [__JSON__](https://www.crwlr.software/packages/crawler/included-steps/json) (using dot notation)
* [__CSV__](https://www.crwlr.software/packages/crawler/included-steps/csv) (map columns)
* [Extract __schema.org__ structured data](https://www.crwlr.software/packages/crawler/included-steps/html#schema-org) in __JSON-LD__ format from HTML documents
* [Keep memory usage low](https://www.crwlr.software/packages/crawler/crawling-procedure#memory-usage) by using PHP __Generators__ 💪
* [__Cache__ HTTP responses](https://www.crwlr.software/packages/crawler/response-cache) during development, so you don't have to load pages again and again after every code change
* [Get __logs__](https://www.crwlr.software/packages/crawler/the-crawler#loggers) about what your crawler is doing (accepts any PSR-3 LoggerInterface)
* And a lot more...
## Documentation
You can find the documentation at [crwlr.software](https://www.crwlr.software/packages/crawler/getting-started).
## Contributing
If you consider contributing something to this package, read the [contribution guide (CONTRIBUTING.md)](CONTRIBUTING.md).
================================================
FILE: bin/add-git-hooks
================================================
#!/usr/bin/env php
<?php
// Install the pre-commit hook shipped with this repository into the local clone's .git/hooks directory.
$src = __DIR__ . '/../git-hooks/pre-commit';
$dest = __DIR__ . '/../.git/hooks/pre-commit';

// Stop with a non-zero exit code when copying fails, instead of calling chmod() on a file that
// was never written and reporting success.
if (!copy($src, $dest)) {
    fwrite(STDERR, 'Failed to copy ' . $src . ' to ' . $dest . PHP_EOL);

    exit(1);
}

// Make the installed hook executable.
if (!chmod($dest, 0755)) {
    fwrite(STDERR, 'Failed to make ' . $dest . ' executable' . PHP_EOL);

    exit(1);
}
================================================
FILE: composer.json
================================================
{
"name": "crwlr/crawler",
"description": "Web crawling and scraping library.",
"type": "library",
"keywords": [
"crwlr",
"crawl",
"crawler",
"crawling",
"scrape",
"scraping",
"scraper",
"web",
"bot"
],
"homepage": "https://www.crwlr.software/packages/crawler",
"license": "MIT",
"authors": [
{
"name": "Christian Olear",
"homepage": "https://www.otsch.codes",
"role": "Developer"
}
],
"support": {
"issues": "https://github.com/crwlrsoft/crawler/issues",
"source": "https://github.com/crwlrsoft/crawler",
"docs": "https://www.crwlr.software/packages/crawler"
},
"require": {
"ext-dom": "*",
"php": "^8.1",
"crwlr/robots-txt": "^1.1",
"crwlr/schema-org": "^0.2|^0.3",
"crwlr/url": "^2.1",
"psr/log": "^2.0|^3.0",
"symfony/dom-crawler": "^6.0|^7.0",
"symfony/css-selector": "^6.0|^7.0",
"psr/simple-cache": "^1.0|^2.0|^3.0",
"guzzlehttp/guzzle": "^7.4",
"adbario/php-dot-notation": "^3.1",
"chrome-php/chrome": "^1.7",
"crwlr/utils": "^1.2",
"crwlr/html-2-text": "^0.1.0"
},
"require-dev": {
"pestphp/pest": "^2.3|^3.0|^4.0",
"mockery/mockery": "^1.5",
"phpstan/phpstan": "^1.4|^2.0",
"phpstan/phpstan-mockery": "^1.0|^2.0",
"phpstan/extension-installer": "^1.1",
"phpstan/phpstan-phpunit": "^1.0|^2.0",
"friendsofphp/php-cs-fixer": "^3.57",
"spatie/invade": "^2.0",
"symfony/process": "^6.0|^7.0"
},
"suggest": {
"ext-zlib": "Needed to uncompress compressed responses",
"voku/portable-ascii": "^2.0"
},
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/otsch"
}
],
"autoload": {
"psr-4": {
"Crwlr\\Crawler\\": "src/"
}
},
"autoload-dev": {
"psr-4": {
"tests\\": "tests/"
}
},
"scripts": {
"test": "./vendor/bin/pest --exclude-group=integration --exclude-group=php84 --display-warnings --bail",
"test-php84": "./vendor/bin/pest --group=php84 --display-warnings --bail",
"test-integration": "./vendor/bin/pest --group=integration --display-warnings --bail",
"stan": "@php -d memory_limit=4G vendor/bin/phpstan analyse",
"cs": "php-cs-fixer fix -v --dry-run",
"cs-fix": "php-cs-fixer fix -v",
"add-git-hooks": "@php bin/add-git-hooks"
},
"config": {
"allow-plugins": {
"pestphp/pest-plugin": true,
"phpstan/extension-installer": true
}
}
}
================================================
FILE: git-hooks/pre-commit
================================================
#!/usr/bin/env php
<?php
// Checks to run before a commit, as shell command => name shown in the output.
// They run in this order; a failing check ends the script with exit code 1 (see handleFail()).
$checks = [
    'composer test' => 'Unit tests',
    'composer test-integration' => 'Integration tests',
    'composer cs-fix' => 'PHP Coding Standards Fixer',
    'composer stan' => 'PHPStan',
];

foreach ($checks as $command => $label) {
    run($command, $label);
}

// Every check passed.
exit(0);
/**
 * Execute a shell command, announce it and print either its failure output or a short summary.
 *
 * @param string $command the shell command to execute
 * @param string|null $descriptiveName name to show in the output, falls back to the command itself
 */
function run(string $command, ?string $descriptiveName = null)
{
    $label = $descriptiveName ?? $command;

    printLine(blue('RUN ' . $label . '...'));

    exec($command, $output, $returnCode);

    // Ends the script when the command failed, so the summary is only shown on success.
    handleFail($output, $returnCode);

    showSummary($output);
}
/**
 * Print the command output and end the script with exit code 1 when the return code is not 0.
 *
 * @param mixed $output the output lines collected by exec()
 * @param mixed $returnCode the exit code of the executed command
 */
function handleFail($output, $returnCode)
{
    if ($returnCode === 0) {
        return;
    }

    printLine(red('Failed:'));
    printLines($output);
    printLine(red('Aborting commit...'));

    exit(1);
}
/**
 * Print a "Summary:" headline followed by the last non-empty line of the command output,
 * surrounded by blank lines.
 *
 * @param array $output the output lines of the executed command
 */
function showSummary(array $output)
{
printBlankLine();
printLine(green('Summary:'));
outputLastNotEmptyLine($output);
printBlankLine();
}
/**
 * Print the last line of the output that isn't empty after trimming. Prints nothing when there is none.
 *
 * @param array $output the output lines of the executed command
 */
function outputLastNotEmptyLine(array $output)
{
    // Walk the lines from the end and stop at the first one with actual content.
    foreach (array_reverse($output) as $line) {
        if (trim($line) === '') {
            continue;
        }

        printLine($line);

        return;
    }
}
/**
 * Echo a string followed by a line break.
 */
function printLine(string $string)
{
    echo $string, PHP_EOL;
}
/**
 * Echo all lines separated by line breaks, followed by one trailing line break.
 *
 * @param array $lines
 */
function printLines(array $lines)
{
    $text = implode(PHP_EOL, $lines);

    echo $text, PHP_EOL;
}
/**
 * Print an empty line.
 */
function printBlankLine()
{
printLine('');
}
/**
 * Wrap a string in the escape sequence with color code 0;31 (used for failure messages).
 */
function red(string $string): string
{
return color('0;31', $string);
}
/**
 * Wrap a string in the escape sequence with color code 0;32 (used for the summary headline).
 */
function green(string $string): string
{
return color('0;32', $string);
}
/**
 * Wrap a string in the escape sequence with color code 0;34 (used for the "RUN ..." lines).
 */
function blue(string $string): string
{
return color('0;34', $string);
}
/**
 * Wrap a string in an escape sequence that starts with the given color code and ends with a reset (code 0).
 *
 * @param string $colorCode the code placed between "\e[" and "m", e.g. "0;31"
 * @param string $string the text to wrap
 */
function color(string $colorCode, string $string): string
{
    return sprintf("\e[%sm%s\e[0m", $colorCode, $string);
}
================================================
FILE: phpstan.neon
================================================
parameters:
level: 8
paths:
- src
- tests
excludePaths:
analyse:
- tests/_Integration/_Server
reportUnmatchedIgnoredErrors: false
ignoreErrors:
- "#^Call to an undefined method Pest\\\\PendingCalls\\\\TestCall\\|Pest\\\\Support\\\\HigherOrderTapProxy\\:\\:(with|throws)\\(\\).$#"
- "#^Access to an undefined property Spatie\\\\Invade\\\\Invader#"
- "#^Call to an undefined method Spatie\\\\Invade\\\\Invader#"
- "#^Call to protected method [a-zA-Z]{5,30}\\(\\) of class PHPUnit\\\\Framework\\\\TestCase.#"
- "#^(?:Parameter|Method) .+ has invalid (return )?type Dom\\\\.+\\.#"
- "#^Call to .+ on an unknown class Dom\\\\.+\\.#"
- "#^Property .+ has unknown class Dom\\\\.+ as its type\\.#"
- "#^Class Dom\\\\.+ not found.#"
- "#^Access to property .+ on an unknown class Dom\\\\.+\\.#"
- "#^PHPDoc tag .+ contains unknown class Dom\\\\.+\\.#"
- "#^Call to an undefined (static )?method Dom\\\\.+::.+\\(\\)\\.#"
- "#^Access to an undefined property Dom\\\\.+::\\$.+\\.#"
- "#^Function .+ has invalid return type Dom\\\\.+\\.#"
- "#^(?:Used )?(?:C|c)onstant DOM\\\\.+ not found\\.#"
- "#^Instantiated class Dom\\\\.+ not found.#"
================================================
FILE: phpunit.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/10.1/phpunit.xsd" bootstrap="vendor/autoload.php" colors="true" cacheDirectory=".phpunit.cache">
<testsuites>
<testsuite name="Test Suite">
<directory suffix="Test.php">./tests</directory>
</testsuite>
</testsuites>
<coverage/>
<source>
<include>
<directory suffix=".php">./app</directory>
<directory suffix=".php">./src</directory>
</include>
</source>
</phpunit>
================================================
FILE: src/Cache/CacheItem.php
================================================
<?php
namespace Crwlr\Crawler\Cache;
use DateInterval;
use DateTimeImmutable;
use Exception;
/**
 * A cached value together with its key, its time to live and the time it was created.
 */
class CacheItem
{
    protected string $key;

    /**
     * @param mixed $value the value to cache
     * @param string|null $key the cache key; when null or an empty string, the key is taken from the value's
     *                         cacheKey() method if it has one, otherwise it's an md5 hash of the serialized value
     * @param int|DateInterval $ttl time to live, either in seconds or as an interval
     * @param DateTimeImmutable $createdAt the point in time from which the ttl is counted
     */
    public function __construct(
        protected mixed $value,
        ?string $key = null,
        public readonly int|DateInterval $ttl = 3600,
        public readonly DateTimeImmutable $createdAt = new DateTimeImmutable(),
    ) {
        // Compare explicitly instead of checking truthiness: the string '0' is a valid key and must
        // not be replaced with a generated one.
        if ($key !== null && $key !== '') {
            $this->key = $key;
        } elseif (is_object($this->value) && method_exists($this->value, 'cacheKey')) {
            $this->key = $this->value->cacheKey();
        } else {
            $this->key = md5(serialize($this->value));
        }
    }

    public function key(): string
    {
        return $this->key;
    }

    public function value(): mixed
    {
        return $this->value;
    }

    /**
     * Tells if the current time is after the creation time plus the time to live.
     *
     * @throws Exception
     */
    public function isExpired(): bool
    {
        $ttl = $this->ttl instanceof DateInterval ? $this->ttl : new DateInterval('PT' . $this->ttl . 'S');

        return time() > $this->createdAt->add($ttl)->getTimestamp();
    }

    /**
     * Get a new instance with same data but a different time to live.
     */
    public function withTtl(DateInterval|int $ttl): CacheItem
    {
        return new CacheItem($this->value, $this->key, $ttl, $this->createdAt);
    }

    /**
     * @return mixed[]
     */
    public function __serialize(): array
    {
        return [
            'value' => $this->value,
            'key' => $this->key,
            'ttl' => $this->ttl,
            'createdAt' => $this->createdAt,
        ];
    }

    /**
     * @param mixed[] $data
     */
    public function __unserialize(array $data): void
    {
        $this->value = $data['value'];

        $this->key = $data['key'];

        // The constructor isn't called when unserializing, so the readonly properties are initialized here.
        $this->ttl = $data['ttl'];

        $this->createdAt = $data['createdAt'];
    }
}
================================================
FILE: src/Cache/Exceptions/MissingZlibExtensionException.php
================================================
<?php
namespace Crwlr\Crawler\Cache\Exceptions;
use Exception;
use Psr\SimpleCache\CacheException;
/**
 * Cache exception for when cache data should be compressed or decompressed, but PHP's ext-zlib is not installed.
 */
class MissingZlibExtensionException extends Exception implements CacheException {}
================================================
FILE: src/Cache/Exceptions/ReadingCacheFailedException.php
================================================
<?php
namespace Crwlr\Crawler\Cache\Exceptions;
use Exception;
use Psr\SimpleCache\CacheException;
/**
 * Cache exception for when the content of a cache file can't be read.
 */
class ReadingCacheFailedException extends Exception implements CacheException {}
================================================
FILE: src/Cache/FileCache.php
================================================
<?php
namespace Crwlr\Crawler\Cache;
use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Cache\Exceptions\ReadingCacheFailedException;
use Crwlr\Crawler\Utils\Gzip;
use DateInterval;
use Exception;
use Psr\SimpleCache\CacheInterface;
use Psr\SimpleCache\InvalidArgumentException;
use Throwable;
/**
 * File based cache implementing the PSR-16 CacheInterface. Every item is stored as one file, named by its
 * key, directly in the base path.
 *
 * Values are wrapped in CacheItem objects and serialized. Optionally the serialized data is compressed.
 */
class FileCache implements CacheInterface
{
    protected DateInterval|int $ttl = 3600;

    protected bool $useCompression = false;

    public function __construct(
        protected readonly string $basePath,
    ) {}

    /**
     * Compress the cache data when saving items.
     */
    public function useCompression(): static
    {
        $this->useCompression = true;

        return $this;
    }

    /**
     * Set the default time to live that set() uses when it is called without a ttl.
     */
    public function ttl(DateInterval|int $ttl): static
    {
        $this->ttl = $ttl;

        return $this;
    }

    /**
     * Tells if there is a not yet expired item for the key. An expired item is deleted.
     *
     * @throws MissingZlibExtensionException|ReadingCacheFailedException|Exception|InvalidArgumentException
     */
    public function has(string $key): bool
    {
        if (file_exists($this->basePath . '/' . $key)) {
            $cacheItem = $this->getCacheItem($key);

            if (!$cacheItem->isExpired()) {
                return true;
            }

            $this->delete($key);
        }

        return false;
    }

    /**
     * Get the cached value for the key, or the default when there is none or it is expired.
     * An expired item is deleted.
     *
     * @throws ReadingCacheFailedException|MissingZlibExtensionException|Exception|InvalidArgumentException
     */
    public function get(string $key, mixed $default = null): mixed
    {
        if (file_exists($this->basePath . '/' . $key)) {
            $cacheItem = $this->getCacheItem($key);

            if (!$cacheItem->isExpired()) {
                return $cacheItem->value();
            }

            $this->delete($key);
        }

        return $default;
    }

    /**
     * Save a value. A CacheItem instance with a matching key is stored as it is, any other value is wrapped
     * in a new CacheItem with the given key.
     *
     * @throws MissingZlibExtensionException
     */
    public function set(string $key, mixed $value, DateInterval|int|null $ttl = null): bool
    {
        if (!$value instanceof CacheItem) {
            $value = new CacheItem($value, $key, $ttl ?? $this->ttl);
        } elseif ($value->key() !== $key) {
            $value = new CacheItem($value->value(), $key, $ttl ?? $value->ttl);
        }

        return $this->saveCacheItem($value);
    }

    /**
     * Delete the item with the given key.
     *
     * @return bool true when the item was removed or didn't exist in the first place, false when removing failed
     */
    public function delete(string $key): bool
    {
        $path = $this->basePath . '/' . $key;

        // There's nothing to do for a key that isn't cached. Calling unlink() on a missing file would
        // emit a warning and report a failure, which also makes deleteMultiple() stop early.
        if (!file_exists($path)) {
            return true;
        }

        return unlink($path);
    }

    /**
     * Save the item with the given key again with a new time to live.
     *
     * @return bool false when the item can't be read or saved
     */
    public function prolong(string $key, DateInterval|int $ttl): bool
    {
        try {
            $item = $this->getCacheItem($key);

            return $this->saveCacheItem($item->withTtl($ttl));
        } catch (Throwable) {
            return false;
        }
    }

    /**
     * Delete everything in the base path, except for a .gitkeep file. Stops at the first failure.
     *
     * @throws InvalidArgumentException
     */
    public function clear(): bool
    {
        $allFiles = scandir($this->basePath);

        if (is_array($allFiles)) {
            foreach ($allFiles as $file) {
                if ($file !== '.' && $file !== '..' && $file !== '.gitkeep' && !$this->delete($file)) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Prolong everything in the base path, except for a .gitkeep file. Stops at the first failure.
     */
    public function prolongAll(DateInterval|int $ttl): bool
    {
        $allFiles = scandir($this->basePath);

        if (is_array($allFiles)) {
            foreach ($allFiles as $file) {
                if ($file !== '.' && $file !== '..' && $file !== '.gitkeep' && !$this->prolong($file, $ttl)) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * @return iterable<mixed>
     * @throws MissingZlibExtensionException|ReadingCacheFailedException|InvalidArgumentException
     */
    public function getMultiple(iterable $keys, mixed $default = null): iterable
    {
        $items = [];

        foreach ($keys as $key) {
            $items[$key] = $this->get($key, $default);
        }

        return $items;
    }

    /**
     * @param iterable<mixed> $values
     * @throws MissingZlibExtensionException
     */
    public function setMultiple(iterable $values, DateInterval|int|null $ttl = null): bool
    {
        foreach ($values as $key => $value) {
            if (!$this->set($key, $value, $ttl)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Delete the items for all the given keys. Stops and returns false at the first failure.
     */
    public function deleteMultiple(iterable $keys): bool
    {
        foreach ($keys as $key) {
            if (!$this->delete($key)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Read and unserialize the cache file for the key.
     *
     * @throws MissingZlibExtensionException
     * @throws ReadingCacheFailedException
     */
    protected function getCacheItem(string $key): CacheItem
    {
        $fileContent = $this->getFileContents($key);

        if ($this->useCompression) {
            $fileContent = $this->decode($fileContent);
        }

        $unserialized = $this->unserialize($fileContent);

        // File content that isn't a serialized CacheItem is wrapped in a new item with the default ttl.
        if (!$unserialized instanceof CacheItem) {
            $unserialized = new CacheItem($unserialized, $key);
        }

        return $unserialized;
    }

    /**
     * Serialize the item, compress it if compression is enabled, and write it to the file named by its key.
     *
     * @throws MissingZlibExtensionException
     */
    protected function saveCacheItem(CacheItem $item): bool
    {
        $content = serialize($item);

        if ($this->useCompression) {
            $content = $this->encode($content);
        }

        return file_put_contents($this->basePath . '/' . $item->key(), $content) !== false;
    }

    /**
     * Unserialize cache file content. When that fails, it tries to decode the content and unserialize again.
     */
    protected function unserialize(string $content): mixed
    {
        // Temporarily set a new error handler, so unserializing a compressed string does not result in a PHP warning.
        set_error_handler(function ($errno, $errstr) {
            return $errno === E_WARNING && str_starts_with($errstr, 'unserialize(): Error at offset 0 of ');
        });

        try {
            $unserialized = unserialize($content);

            if ($unserialized === false) { // if unserializing fails, try if the string is compressed.
                try {
                    $content = $this->decode($content);

                    $unserialized = unserialize($content);
                } catch (Throwable) {
                }
            }

            return $unserialized;
        } finally {
            // Restore the previous handler in any case, also when unserialize() throws (e.g. from an
            // object's __unserialize() method), so the temporary handler never stays installed.
            restore_error_handler();
        }
    }

    /**
     * @throws ReadingCacheFailedException
     */
    protected function getFileContents(string $key): string
    {
        $fileContent = file_get_contents($this->basePath . '/' . $key);

        if ($fileContent === false) {
            throw new ReadingCacheFailedException('Failed to read cache file.');
        }

        return $fileContent;
    }

    /**
     * @throws MissingZlibExtensionException
     */
    protected function encode(string $content): string
    {
        try {
            return Gzip::encode($content, true);
        } catch (MissingZlibExtensionException) {
            throw new MissingZlibExtensionException(
                'Can\'t compress response cache data. Compression needs PHP ext-zlib installed.',
            );
        }
    }

    /**
     * @throws MissingZlibExtensionException
     */
    protected function decode(string $content): string
    {
        try {
            return Gzip::decode($content, true);
        } catch (MissingZlibExtensionException) {
            throw new MissingZlibExtensionException('FileCache compression needs PHP ext-zlib installed.');
        }
    }
}
================================================
FILE: src/Crawler.php
================================================
<?php
namespace Crwlr\Crawler;
use Closure;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\BaseStep;
use Crwlr\Crawler\Steps\Exceptions\PreRunValidationException;
use Crwlr\Crawler\Steps\Group;
use Crwlr\Crawler\Steps\StepInterface;
use Crwlr\Crawler\Stores\StoreInterface;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Exception;
use Generator;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
abstract class Crawler
{
protected UserAgentInterface $userAgent;
/**
* @var LoaderInterface
*/
protected LoaderInterface $loader;
protected LoggerInterface $logger;
protected mixed $inputs = [];
/**
* @var array<int, StepInterface>
*/
protected array $steps = [];
protected ?StoreInterface $store = null;
protected bool|int $monitorMemoryUsage = false;
protected ?Closure $outputHook = null;
/**
 * Creates the user agent and the logger first, because both are passed on to loader() to create the loader.
 */
public function __construct()
{
$this->userAgent = $this->userAgent();
$this->logger = $this->logger();
$this->loader = $this->loader($this->userAgent, $this->logger);
}
/**
 * A cloned crawler starts without inputs, steps, store and output hook.
 */
public function __clone(): void
{
    $this->inputs = $this->steps = [];

    $this->store = $this->outputHook = null;
}
/**
 * Provide the user agent for this crawler. Called once from the constructor; the result is passed on to loader().
 */
abstract protected function userAgent(): UserAgentInterface;
/**
 * Create the loader for this crawler. Called from the constructor and again from setUserAgent().
 *
 * @param UserAgentInterface $userAgent
 * @param LoggerInterface $logger
 * @return LoaderInterface
 */
abstract protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface;
/**
 * Create a new, empty Group step.
 */
public static function group(): Group
{
return new Group();
}
/**
 * Set PHP's memory_limit ini setting.
 *
 * @return false|string the return value of ini_set(): the previous value on success, false on failure
 */
public static function setMemoryLimit(string $memoryLimit): false|string
{
return ini_set('memory_limit', $memoryLimit);
}
public static function getMemoryLimit(): false|string
{
return ini_get('memory_limit');
}
public function getSubCrawler(): Crawler
{
return clone $this;
}
public function getUserAgent(): UserAgentInterface
{
return $this->userAgent;
}
public function setUserAgent(UserAgentInterface $userAgent): static
{
$this->userAgent = $userAgent;
$this->loader = $this->loader($userAgent, $this->logger);
return $this;
}
public function getLogger(): LoggerInterface
{
return $this->logger;
}
/**
* @return LoaderInterface|array<string, LoaderInterface>
*/
public function getLoader(): LoaderInterface|array
{
return $this->loader;
}
public function setStore(StoreInterface $store): static
{
$store->addLogger($this->logger);
$this->store = $store;
return $this;
}
public function input(mixed $input): static
{
$this->inputs[] = $input;
return $this;
}
/**
* @param mixed[] $inputs
*/
public function inputs(array $inputs): static
{
$this->inputs = array_merge($this->inputs, $inputs);
return $this;
}
/**
* @param StepInterface $step
* @return $this
* @throws InvalidArgumentException
*/
public function addStep(StepInterface $step): static
{
$step->addLogger($this->logger);
if (method_exists($step, 'setLoader')) {
$step->setLoader($this->loader);
}
if ($step instanceof BaseStep) {
$step->setParentCrawler($this);
}
$this->steps[] = $step;
return $this;
}
/**
* Run the crawler and traverse results
*
* When you've set a store, or you just don't need the results for any other reason (e.g. you use the crawler for
* cache warming) where you're calling the crawler, use this method.
*
* @throws Exception
*/
public function runAndTraverse(): void
{
foreach ($this->run() as $result) {
}
}
/**
* Easy way to just crawl and dump the results
*
* @throws Exception
*/
public function runAndDump(): void
{
foreach ($this->run() as $result) {
var_dump($result->toArray());
}
}
/**
* Run the Crawler
*
* Handles calling all the steps and cascading the data from step to step.
* It returns a Generator, so when using this method directly, you need to traverse the Generator, otherwise nothing
* happens. Alternatively you can use runAndTraverse().
*
* @return Generator<Result>
* @throws Exception|PreRunValidationException
*/
public function run(): Generator
{
$this->validateSteps();
$inputs = $this->prepareInput();
if ($this->firstStep()) {
foreach ($inputs as $input) {
$results = $this->invokeStepsRecursive($input, $this->firstStep(), 0);
/** @var Generator<Result> $results */
yield from $results;
}
}
$this->reset();
}
/**
* Use this method if you want the crawler to add log messages with the current memory usage after every step
* invocation.
*
* @param int|null $ifAboveXBytes You can provide an int of bytes as a limit above which the crawler should log
* the usage.
*/
public function monitorMemoryUsage(?int $ifAboveXBytes = null): static
{
$this->monitorMemoryUsage = $ifAboveXBytes ?? true;
return $this;
}
public function outputHook(Closure $callback): static
{
$this->outputHook = $callback;
return $this;
}
protected function logger(): LoggerInterface
{
return new CliLogger();
}
/**
* @return Generator<Output|Result>
*/
protected function invokeStepsRecursive(Input $input, StepInterface $step, int $stepIndex): Generator
{
$outputs = $step->invokeStep($input);
$nextStep = $this->nextStep($stepIndex);
if (!$nextStep) {
yield from $this->storeAndReturnOutputsAsResults($outputs);
return;
}
foreach ($outputs as $output) {
if ($this->monitorMemoryUsage !== false) {
$this->logMemoryUsage();
}
$this->outputHook?->call($this, $output, $stepIndex, $step);
yield from $this->invokeStepsRecursive(
new Input($output),
$nextStep,
$stepIndex + 1,
);
}
}
/**
* @param Generator<Output> $outputs
* @return Generator<Result>
*/
protected function storeAndReturnOutputsAsResults(Generator $outputs): Generator
{
foreach ($outputs as $output) {
$this->outputHook?->call($this, $output, count($this->steps) - 1, end($this->steps));
$result = new Result();
foreach ($output->keep as $key => $value) {
$result->set($key, $value);
}
if (!$this->lastStep()?->keepsAnything()) {
if ($output->isArrayWithStringKeys()) {
foreach ($output->get() as $key => $value) {
$result->set($key, $value);
}
} else {
$result->set('unnamed', $output->get());
}
}
$this->store?->store($result);
yield $result;
}
}
/**
* @throws PreRunValidationException
*/
protected function validateSteps(): void
{
$previousStep = null;
foreach ($this->steps as $index => $step) {
if ($index > 0) {
$previousStep = $this->steps[$index - 1];
}
if (method_exists($step, 'validateBeforeRun')) {
try {
$step->validateBeforeRun($previousStep ?? $this->inputs);
} catch (PreRunValidationException $exception) {
$this->logger->error(
'Pre-Run validation error in step number ' . ($index + 1) . ': ' . $exception->getMessage(),
);
throw $exception;
}
}
}
}
/**
* @return Input[]
* @throws Exception
*/
protected function prepareInput(): array
{
return array_map(function ($input) {
return new Input($input);
}, $this->inputs);
}
protected function logMemoryUsage(): void
{
$memoryUsage = memory_get_usage();
if (!is_int($this->monitorMemoryUsage) || $memoryUsage > $this->monitorMemoryUsage) {
$this->logger->info('memory usage: ' . $memoryUsage);
}
}
protected function firstStep(): ?StepInterface
{
return $this->steps[0] ?? null;
}
protected function lastStep(): ?BaseStep
{
$lastStep = end($this->steps);
if (!$lastStep instanceof BaseStep) {
return null;
}
return $lastStep;
}
protected function nextStep(int $afterIndex): ?StepInterface
{
return $this->steps[$afterIndex + 1] ?? null;
}
protected function reset(): void
{
$this->inputs = [];
foreach ($this->steps as $step) {
$step->resetAfterRun();
}
}
}
================================================
FILE: src/HttpCrawler/AnonymousHttpCrawlerBuilder.php
================================================
<?php
namespace Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
class AnonymousHttpCrawlerBuilder
{
    public function __construct() {}

    /**
     * Creates an anonymous HttpCrawler identifying as a bot with the given product token.
     */
    public function withBotUserAgent(string $productToken): HttpCrawler
    {
        $crawler = $this->newCrawlerWithTemporaryUserAgent();

        $crawler->setUserAgent(new BotUserAgent($productToken));

        return $crawler;
    }

    /**
     * Creates an anonymous HttpCrawler with the given user agent (a string is wrapped in a UserAgent).
     */
    public function withUserAgent(string|UserAgentInterface $userAgent): HttpCrawler
    {
        $crawler = $this->newCrawlerWithTemporaryUserAgent();

        if (!$userAgent instanceof UserAgentInterface) {
            $userAgent = new UserAgent($userAgent);
        }

        $crawler->setUserAgent($userAgent);

        return $crawler;
    }

    /**
     * Creates an anonymous HttpCrawler using UserAgent::mozilla5CompatibleBrowser().
     */
    public function withMozilla5CompatibleUserAgent(): HttpCrawler
    {
        $browserLikeUserAgent = UserAgent::mozilla5CompatibleBrowser();

        return $this->withUserAgent($browserLikeUserAgent);
    }

    /**
     * Builds the anonymous crawler. Its userAgent() only returns a 'temp' placeholder, because the
     * public builder methods immediately replace it via setUserAgent().
     */
    private function newCrawlerWithTemporaryUserAgent(): HttpCrawler
    {
        return new class extends HttpCrawler {
            protected function userAgent(): UserAgentInterface
            {
                return new UserAgent('temp');
            }
        };
    }
}
================================================
FILE: src/HttpCrawler.php
================================================
<?php
namespace Crwlr\Crawler;
use Crwlr\Crawler\HttpCrawler\AnonymousHttpCrawlerBuilder;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;
/**
 * Crawler base class whose loader() creates an HttpLoader.
 *
 * @method HttpLoader getLoader()
 */
abstract class HttpCrawler extends Crawler
{
    /**
     * Entry point for building an anonymous HttpCrawler without declaring a subclass.
     */
    public static function make(): HttpCrawler\AnonymousHttpCrawlerBuilder
    {
        $builder = new AnonymousHttpCrawlerBuilder();

        return $builder;
    }

    /**
     * Creates the HttpLoader for the given user agent, passing the logger as named argument.
     *
     * @return LoaderInterface
     */
    protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        $httpLoader = new HttpLoader($userAgent, logger: $logger);

        return $httpLoader;
    }
}
================================================
FILE: src/Input.php
================================================
<?php
namespace Crwlr\Crawler;
/**
* Io subclass without members of its own. Crawler wraps every value it hands to a step in an Input.
*/
class Input extends Io {}
================================================
FILE: src/Io.php
================================================
<?php
namespace Crwlr\Crawler;
use Crwlr\Crawler\Utils\OutputTypeHelper;
/**
 * Base class for values travelling through the crawler: a value plus the data that shall be kept
 * along the way, and an optional key that identifies the value (e.g. for uniqueness checks).
 */
class Io
{
    /**
     * Lazily created identifier for the value, see setKey()/getKey().
     */
    protected string|int|float|bool|null $key = null;

    /**
     * When $value is itself an Io instance, its value and kept data are copied (the $keep argument
     * is ignored in that case).
     *
     * @param mixed[] $keep
     */
    final public function __construct(
        protected mixed $value,
        public array $keep = [],
    ) {
        if ($value instanceof self) {
            $this->value = $value->value;

            $this->keep = $value->keep;
        }
    }

    /**
     * Returns a new instance with the given value and the same kept data.
     */
    public function withValue(mixed $value): static
    {
        return new static($value, $this->keep);
    }

    /**
     * Returns a new instance where $key is set to $value in the (array) value.
     *
     * If the current value is not an array with only string keys, an unchanged copy is returned.
     */
    public function withPropertyValue(string $key, mixed $value): static
    {
        if (!$this->isArrayWithStringKeys()) {
            return new static($this);
        }

        $newValue = $this->value;

        $newValue[$key] = $value;

        return $this->withValue($newValue);
    }

    public function get(): mixed
    {
        return $this->value;
    }

    /**
     * Reads a single property from an array or object value, or returns $fallbackValue.
     */
    public function getProperty(string $key, mixed $fallbackValue = null): mixed
    {
        if (is_array($this->value)) {
            return $this->value[$key] ?? $fallbackValue;
        } elseif (is_object($this->value)) {
            $array = OutputTypeHelper::objectToArray($this->value);

            return $array[$key] ?? $fallbackValue;
        }

        return $fallbackValue;
    }

    /**
     * Sets and returns a key to use as identifier
     *
     * To only get unique results from a step use the key this method creates for comparison.
     * In case the output values are arrays or objects and contain a unique identifier that can be used, provide that
     * key name, so it doesn't need to create a key from the whole array/object.
     */
    public function setKey(?string $useFromValue = null): string
    {
        if ($useFromValue && is_array($this->value) && array_key_exists($useFromValue, $this->value)) {
            $this->key = $this->valueToString($this->value[$useFromValue]);
        } elseif ($useFromValue && is_object($this->value) && property_exists($this->value, $useFromValue)) {
            $this->key = $this->valueToString($this->value->{$useFromValue});
        } else {
            $this->key = $this->valueToString($this->value);
        }

        return $this->key;
    }

    /**
     * Returns the key, creating it from the whole value first if none was set yet.
     */
    public function getKey(): string|int|float|bool|null
    {
        if ($this->key === null) {
            $this->setKey();
        }

        return $this->key;
    }

    /**
     * Merges $data into the kept data (array_merge_recursive()) on this instance.
     *
     * @param mixed[] $data
     */
    public function keep(array $data): static
    {
        $this->keep = array_merge_recursive($this->keep, $data);

        return $this;
    }

    /**
     * True when the value is an array and none of its keys is a non-string (an empty array counts as true).
     */
    public function isArrayWithStringKeys(): bool
    {
        if (!is_array($this->value)) {
            return false;
        }

        foreach ($this->value as $key => $value) {
            if (!is_string($key)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Converts the given value to the string used as key.
     *
     * Arrays and objects are hashed (md5 of their serialized form). The hash is built from the
     * value passed in, not from the whole Io value, so that setKey('id') with an array/object
     * valued 'id' really identifies by that property only.
     */
    protected function valueToString(mixed $value): string
    {
        if (is_array($value) || is_object($value)) {
            return md5(serialize($value));
        } elseif (is_int($value) || is_float($value)) {
            return (string) $value;
        } elseif (is_bool($value)) {
            return $value ? 'true' : 'false';
        } elseif (is_null($value)) {
            return 'null';
        }

        return $value;
    }
}
================================================
FILE: src/Loader/Http/Browser/Screenshot.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Browser;
/**
* Immutable value object carrying a single path string.
*
* NOTE(review): presumably the file path a browser screenshot was written to - confirm against the code
* that creates Screenshot instances.
*/
class Screenshot
{
public function __construct(
public readonly string $path,
) {}
}
================================================
FILE: src/Loader/Http/Browser/ScreenshotConfig.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Browser;
use Crwlr\Utils\Microseconds;
use HeadlessChromium\Clip;
use HeadlessChromium\Exception\CommunicationException\CannotReadResponse;
use HeadlessChromium\Exception\CommunicationException\InvalidResponse;
use HeadlessChromium\Page;
/**
 * Settings for taking screenshots: target directory, image type, quality and full-page flag.
 */
class ScreenshotConfig
{
    public function __construct(
        public string $storePath,
        public string $fileType = 'png',
        public ?int $quality = null,
        public bool $fullPage = false,
    ) {}

    /**
     * Shortcut for creating a config with defaults for everything but the store path.
     */
    public static function make(string $storePath): self
    {
        return new self($storePath);
    }

    /**
     * Builds the file path for a screenshot of the given page: md5 of the page's current URL, the
     * current Microseconds value and the file type extension, inside the store path.
     *
     * @throws CannotReadResponse
     * @throws InvalidResponse
     */
    public function getFullPath(Page $page): string
    {
        $urlHash = md5($page->getCurrentUrl());

        $filename = $urlHash . '-' . Microseconds::now()->value . '.' . $this->fileType;

        $separator = str_ends_with($this->storePath, '/') ? '' : '/';

        return $this->storePath . $separator . $filename;
    }

    /**
     * Sets the image type to jpeg, png or webp; any other value is silently ignored.
     *
     * Switching to png removes the quality setting, switching to jpeg/webp defaults it to 80
     * when no quality is set yet.
     */
    public function setImageFileType(string $type): self
    {
        $isSupportedType = in_array($type, ['jpeg', 'png', 'webp'], true);

        if (!$isSupportedType) {
            return $this;
        }

        $this->fileType = $type;

        if ($type === 'png') {
            $this->quality = null;
        } elseif ($this->quality === null) {
            $this->quality = 80;
        }

        return $this;
    }

    /**
     * Sets the quality (1-100). Ignored when the current file type is not jpeg/webp or the value is out of range.
     */
    public function setQuality(int $quality): self
    {
        $typeHasQuality = in_array($this->fileType, ['jpeg', 'webp'], true);

        $isInRange = $quality > 0 && $quality <= 100;

        if ($typeHasQuality && $isInRange) {
            $this->quality = $quality;
        }

        return $this;
    }

    public function setFullPage(): self
    {
        $this->fullPage = true;

        return $this;
    }

    /**
     * Translates this config to the options array passed to the browser library's screenshot call.
     *
     * @return array<string, int|string|bool|Clip>
     */
    public function toChromePhpScreenshotConfig(Page $page): array
    {
        $chromeConfig = ['format' => $this->fileType];

        $typeHasQuality = in_array($this->fileType, ['jpeg', 'webp'], true);

        if ($typeHasQuality && $this->quality) {
            $chromeConfig['quality'] = $this->quality;
        }

        if (!$this->fullPage) {
            return $chromeConfig;
        }

        $chromeConfig['captureBeyondViewport'] = true;

        $chromeConfig['clip'] = $page->getFullPageClip();

        return $chromeConfig;
    }
}
================================================
FILE: src/Loader/Http/Cache/RetryManager.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Cache;
/**
 * Decides, based on the HTTP status code, whether a response shall be retried.
 *
 * @internal
 */
class RetryManager
{
    /**
     * @param int[]|null $only    When set, only these status codes are retried.
     * @param int[]|null $except  When set, these status codes are never retried.
     */
    public function __construct(
        private ?array $only = null,
        private ?array $except = null,
    ) {}

    /**
     * Restrict retrying to the given status code(s).
     *
     * @param int|int[] $statusCodes
     */
    public function only(int|array $statusCodes): static
    {
        $this->only = is_int($statusCodes) ? [$statusCodes] : $statusCodes;

        return $this;
    }

    /**
     * Exclude the given status code(s) from retrying.
     *
     * @param int|int[] $statusCodes
     */
    public function except(int|array $statusCodes): static
    {
        $this->except = is_int($statusCodes) ? [$statusCodes] : $statusCodes;

        return $this;
    }

    /**
     * True for status codes >= 400 that are not excluded and, if an "only" list is set, are part of it.
     */
    public function shallBeRetried(int $statusCode): bool
    {
        if ($statusCode < 400) {
            return false;
        }

        if ($this->except !== null && in_array($statusCode, $this->except, true)) {
            return false;
        }

        if ($this->only === null) {
            return true;
        }

        return in_array($statusCode, $this->only, true);
    }
}
================================================
FILE: src/Loader/Http/Cookies/Cookie.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Cookies;
use Crwlr\Crawler\Loader\Http\Cookies\Exceptions\InvalidCookieException;
use Crwlr\Url\Psr\Uri;
use Crwlr\Url\Url;
use Exception;
use Psr\Http\Message\UriInterface;
/**
 * A single cookie parsed from a Set-Cookie header string, together with the URL it was received from.
 */
class Cookie
{
    protected Url $receivedFromUrl;

    protected string $receivedFromHost;

    protected string $cookieName;

    protected string $cookieValue;

    protected ?Date $expires = null;

    protected ?int $maxAge = null;

    /**
     * Unix timestamp of when a Max-Age attribute was parsed; base for the Max-Age expiry check.
     */
    protected int $receivedAtTimestamp = 0;

    protected string $domain;

    protected bool $domainSetViaAttribute = false;

    protected ?string $path = null;

    protected bool $secure = false;

    protected bool $httpOnly = false;

    protected string $sameSite = 'Lax';

    /**
     * @param string|Url $receivedFromUrl  URL of the response that sent the cookie; must have a host.
     * @param string $setCookieHeader      Value of the Set-Cookie header.
     * @throws InvalidCookieException
     * @throws Exception
     */
    public function __construct(
        string|Url $receivedFromUrl,
        protected readonly string $setCookieHeader,
    ) {
        $this->receivedFromUrl = $receivedFromUrl instanceof Url ? $receivedFromUrl : Url::parse($receivedFromUrl);

        if (
            !is_string($this->receivedFromUrl->host()) ||
            empty($this->receivedFromUrl->host())
        ) {
            throw new InvalidCookieException('Url where cookie was received from has no host or domain');
        }

        $this->receivedFromHost = $this->receivedFromUrl->host();

        // Default domain (may be overridden by a Domain attribute while parsing the header).
        $this->setDomain($this->receivedFromUrl->domain() ?? $this->receivedFromUrl->host());

        $this->parseSetCookieHeader($this->setCookieHeader);
    }

    /**
     * Tells if this cookie shall be sent with a request to the given URL.
     *
     * Checks domain (host equals the cookie domain or is a subdomain of it), __Host- prefix,
     * Secure flag, path and expiry.
     *
     * @throws Exception
     */
    public function shouldBeSentTo(string|UriInterface|Url $url): bool
    {
        $url = $url instanceof Url ? $url : Url::parse($url);

        $urlHost = $url->host() ?? '';

        return
            $this->domainMatches($urlHost, $this->domain()) &&
            (!$this->hasHostPrefix() || $urlHost === $this->receivedFromHost) &&
            (!$this->secure() || $url->scheme() === 'https' || in_array($urlHost, ['localhost', '127.0.0.1'], true)) &&
            (!$this->path() || $this->pathMatches($url)) &&
            !$this->isExpired();
    }

    public function __toString(): string
    {
        return $this->name() . '=' . $this->value();
    }

    public function receivedFromUrl(): UriInterface
    {
        return new Uri($this->receivedFromUrl);
    }

    public function name(): string
    {
        return $this->cookieName;
    }

    public function value(): string
    {
        return $this->cookieValue;
    }

    public function expires(): ?Date
    {
        return $this->expires;
    }

    public function maxAge(): ?int
    {
        return $this->maxAge;
    }

    /**
     * True when the Expires date is reached or the Max-Age is zero/negative or has elapsed.
     *
     * An Expires value that can't be parsed as a date (e.g. "0") is treated as if the attribute was
     * not sent, instead of letting the parse exception escape from every send check.
     */
    public function isExpired(): bool
    {
        if ($this->expires() === null && $this->maxAge() === null) {
            return false;
        }

        $nowTimestamp = time();

        if ($this->expires() instanceof Date) {
            try {
                $expiresTimestamp = $this->expires()->dateTime()->getTimestamp();
            } catch (\InvalidArgumentException) {
                $expiresTimestamp = null;
            }

            if ($expiresTimestamp !== null && $nowTimestamp >= $expiresTimestamp) {
                return true;
            }
        }

        return $this->maxAge() !== null &&
            ($this->maxAge() <= 0 || $nowTimestamp > ($this->receivedAtTimestamp + $this->maxAge()));
    }

    public function domain(): string
    {
        return $this->domain;
    }

    public function path(): ?string
    {
        return $this->path;
    }

    public function secure(): bool
    {
        return $this->secure;
    }

    public function httpOnly(): bool
    {
        return $this->httpOnly;
    }

    public function sameSite(): string
    {
        return $this->sameSite;
    }

    /**
     * @throws Exception
     */
    public function isReceivedSecure(): bool
    {
        return $this->receivedFromUrl->scheme() === 'https';
    }

    public function hasSecurePrefix(): bool
    {
        return str_starts_with($this->cookieName, '__Secure-');
    }

    public function hasHostPrefix(): bool
    {
        return str_starts_with($this->cookieName, '__Host-');
    }

    /**
     * Splits the header into the name=value pair and the attributes and applies them.
     *
     * @throws InvalidCookieException
     */
    protected function parseSetCookieHeader(string $setCookieHeader): void
    {
        $splitAtSemicolon = explode(';', $setCookieHeader);

        $splitFirstPart = explode('=', trim(array_shift($splitAtSemicolon)), 2);

        if (count($splitFirstPart) !== 2) {
            throw new InvalidCookieException('Invalid cookie string');
        }

        [$this->cookieName, $this->cookieValue] = $splitFirstPart;

        foreach ($splitAtSemicolon as $attribute) {
            $this->parseAttribute($attribute);
        }

        $this->checkPrefixes();
    }

    /**
     * Applies a single cookie attribute (name is matched case-insensitively); unknown attributes are ignored.
     *
     * @throws InvalidCookieException
     */
    protected function parseAttribute(string $attribute): void
    {
        $splitAtEquals = explode('=', trim($attribute), 2);

        $attributeName = strtolower($splitAtEquals[0]);

        $attributeValue = $splitAtEquals[1] ?? '';

        if ($attributeName === 'expires') {
            $this->setExpires($attributeValue);
        } elseif ($attributeName === 'max-age') {
            $this->setMaxAge($attributeValue);
        } elseif ($attributeName === 'domain') {
            $this->setDomain($attributeValue, true);
        } elseif ($attributeName === 'path') {
            $this->setPath($attributeValue);
        } elseif ($attributeName === 'secure') {
            $this->setSecure();
        } elseif ($attributeName === 'httponly') {
            $this->httpOnly = true;
        } elseif ($attributeName === 'samesite') {
            $this->setSameSite($attributeValue);
        }
    }

    /**
     * @see https://datatracker.ietf.org/doc/html/draft-west-cookie-prefixes#section-3
     * @throws InvalidCookieException
     * @throws Exception
     */
    protected function checkPrefixes(): void
    {
        if ($this->hasSecurePrefix() || $this->hasHostPrefix()) {
            if (!$this->isReceivedSecure()) {
                throw new InvalidCookieException(
                    'Cookie is prefixed with __Secure- or __Host- but was not sent via https',
                );
            }

            if (!$this->secure()) {
                throw new InvalidCookieException(
                    'Cookie is prefixed with __Secure- or __Host- but Secure flag was not sent',
                );
            }
        }

        if ($this->hasHostPrefix()) {
            if ($this->domainSetViaAttribute) {
                throw new InvalidCookieException('Cookie with __Host- prefix must not contain a Domain attribute');
            }

            if ($this->path !== '/') {
                throw new InvalidCookieException('Cookie with __Host- prefix must have a Path attribute with value /');
            }
        }
    }

    protected function setExpires(string $value): void
    {
        $this->expires = new Date($value);
    }

    protected function setMaxAge(string $value): void
    {
        $this->maxAge = (int) $value;

        $this->receivedAtTimestamp = time();
    }

    /**
     * Sets the cookie domain after validating it against the host the cookie was received from.
     *
     * A leading dot is dropped and the value is lower-cased. The host must be equal to the domain
     * or be a subdomain of it; a mere substring (e.g. "ample.com" for "www.example.com") is rejected.
     * An empty Domain attribute is ignored, so the default domain stays in place.
     *
     * @throws InvalidCookieException
     * @throws Exception
     */
    protected function setDomain(string $value, bool $viaAttribute = false): void
    {
        if (str_starts_with($value, '.')) {
            $value = substr($value, 1);
        }

        $value = strtolower($value);

        if ($viaAttribute && $value === '') {
            return;
        }

        if (!$this->domainMatches($this->receivedFromHost, $value)) {
            throw new InvalidCookieException(
                'Setting cookie for ' . $value . ' from ' . $this->receivedFromHost . ' is not allowed.',
            );
        }

        $this->domain = $value;

        if ($viaAttribute) {
            $this->domainSetViaAttribute = true;
        }
    }

    protected function setPath(string $path): void
    {
        $this->path = $path;
    }

    /**
     * @throws InvalidCookieException
     * @throws Exception
     */
    protected function setSecure(): void
    {
        if (!$this->isReceivedSecure()) {
            throw new InvalidCookieException(
                'Secure flag can\'t be set when cookie was sent from non-https document url.',
            );
        }

        $this->secure = true;
    }

    /**
     * Accepts strict, lax or none (case-insensitive) and stores it with an upper-case first letter.
     *
     * @throws InvalidCookieException
     */
    protected function setSameSite(string $value): void
    {
        $value = strtolower($value);

        if (!in_array($value, ['strict', 'lax', 'none'], true)) {
            throw new InvalidCookieException('Invalid value for attribute SameSite');
        }

        $this->sameSite = ucfirst($value);
    }

    /**
     * Path matching: the URL path equals the cookie path, or the cookie path is a prefix of it that
     * either ends with "/" itself or is followed by "/" in the URL path.
     *
     * @throws Exception
     */
    protected function pathMatches(Url $url): bool
    {
        $path = $this->path() ?? '';

        $urlPath = $url->path() ?? '';

        if ($urlPath === $path) {
            return true;
        }

        return str_starts_with($urlPath, $path) &&
            (
                str_ends_with($path, '/') ||
                str_starts_with($urlPath, $path . '/')
            );
    }

    /**
     * Domain matching: the host is identical to the domain, or (for non-IP hosts) ends with
     * "." followed by the domain. Comparison is case-insensitive; an empty domain never matches.
     */
    protected function domainMatches(string $host, string $domain): bool
    {
        $host = strtolower($host);

        $domain = strtolower($domain);

        if ($domain === '') {
            return false;
        }

        if ($host === $domain) {
            return true;
        }

        // Suffix matching makes no sense for IP addresses ("3.4" is no parent domain of "1.2.3.4").
        if (filter_var($host, FILTER_VALIDATE_IP) !== false) {
            return false;
        }

        return str_ends_with($host, '.' . $domain);
    }
}
================================================
FILE: src/Loader/Http/Cookies/CookieJar.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Cookies;
use Crwlr\Crawler\Loader\Http\Cookies\Exceptions\InvalidCookieException;
use Crwlr\Url\Url;
use DateTime;
use Exception;
use HeadlessChromium\Cookies\Cookie as BrowserCookie;
use HeadlessChromium\Cookies\CookiesCollection;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
/**
 * In-memory collection of cookies, grouped by domain and indexed by cookie name.
 */
class CookieJar
{
    /**
     * @var Cookie[][]
     */
    protected array $jar = [];

    /**
     * Returns all cookies stored under exactly the given domain key.
     *
     * @param string $domain
     * @return Cookie[]
     */
    public function allByDomain(string $domain): array
    {
        return $this->jar[$domain] ?? [];
    }

    /**
     * Removes all cookies from the jar.
     */
    public function flush(): void
    {
        $this->jar = [];
    }

    /**
     * Adds the cookies of a response (set-cookie headers) or of a browser cookie collection.
     *
     * A cookie with the same name under the same domain key replaces the previous one.
     *
     * @throws InvalidCookieException
     * @throws Exception
     */
    public function addFrom(string|UriInterface|Url $url, ResponseInterface|CookiesCollection $response): void
    {
        if ($response instanceof CookiesCollection) {
            $this->addFromBrowserCookieCollection($url, $response);

            return;
        }

        $setCookieHeaders = $response->getHeader('set-cookie');

        if (empty($setCookieHeaders)) {
            return;
        }

        $url = $url instanceof Url ? $url : Url::parse($url);

        $domainKey = $this->getForDomainFromUrl($url);

        if (!$domainKey) {
            return;
        }

        foreach ($setCookieHeaders as $setCookieHeader) {
            $cookie = new Cookie($url, $setCookieHeader);

            $this->jar[$domainKey][$cookie->name()] = $cookie;
        }
    }

    /**
     * Adds all cookies of a headless browser cookie collection, by converting each of them to a
     * Set-Cookie header string first.
     *
     * @throws InvalidCookieException
     * @throws Exception
     */
    public function addFromBrowserCookieCollection(string|UriInterface|Url $url, CookiesCollection $collection): void
    {
        if ($collection->count() === 0) {
            return;
        }

        $url = $url instanceof Url ? $url : Url::parse($url);

        $domainKey = $this->getForDomainFromUrl($url);

        if (!$domainKey) {
            return;
        }

        foreach ($collection as $browserCookie) {
            $headerString = $this->buildSetCookieHeaderFromBrowserCookie($browserCookie);

            $cookie = new Cookie($url, $headerString);

            $this->jar[$domainKey][$cookie->name()] = $cookie;
        }
    }

    /**
     * Returns the stored cookies that should be sent to the given URL.
     *
     * @return Cookie[]
     * @throws Exception
     */
    public function getFor(string|UriInterface $url): array
    {
        $domainKey = $this->getForDomainFromUrl($url);

        if (!$domainKey || !array_key_exists($domainKey, $this->jar)) {
            return [];
        }

        $matching = array_filter(
            $this->jar[$domainKey],
            fn(Cookie $cookie) => $cookie->shouldBeSentTo($url),
        );

        // The jar is indexed by cookie name, callers get a plain list.
        return array_values($matching);
    }

    /**
     * Determines the key under which cookies for the URL are stored: the URL's domain, or its host
     * when the domain is empty. Null when neither is a string.
     *
     * @throws Exception
     */
    protected function getForDomainFromUrl(string|UriInterface|Url $url): ?string
    {
        $url = $url instanceof Url ? $url : Url::parse($url);

        $domainKey = empty($url->domain()) ? $url->host() : $url->domain();

        return is_string($domainKey) ? $domainKey : null;
    }

    /**
     * Builds a Set-Cookie header string from the name, value and attributes of a browser cookie.
     */
    protected function buildSetCookieHeaderFromBrowserCookie(BrowserCookie $cookie): string
    {
        // browser cookie key => Set-Cookie attribute name
        $attributeNames = [
            'domain' => 'Domain',
            'expires' => 'Expires',
            'max-age' => 'Max-Age',
            'path' => 'Path',
            'secure' => 'Secure',
            'httpOnly' => 'HttpOnly',
            'sameSite' => 'SameSite',
        ];

        $headerParts = [sprintf('%s=%s', $cookie->getName(), $cookie->getValue())];

        foreach ($attributeNames as $browserKey => $headerName) {
            $attributeValue = $cookie->offsetGet($browserKey);

            if (empty($attributeValue)) {
                continue;
            }

            if ($browserKey === 'expires') {
                // An expires value of -1 is left out of the header.
                if ($attributeValue !== -1) {
                    $headerParts[] = sprintf('%s=%s', $headerName, $this->formatExpiresValue($attributeValue));
                }
            } elseif ($attributeValue === true) {
                // Flag attributes are added without a value.
                $headerParts[] = $headerName;
            } else {
                $headerParts[] = sprintf('%s=%s', $headerName, $attributeValue);
            }
        }

        return implode('; ', $headerParts);
    }

    /**
     * Formats a numeric (unix timestamp, optionally with fraction) expires value as date string.
     * Non-numeric values, and values that can't be converted, are returned cast to string.
     */
    private function formatExpiresValue(mixed $value): string
    {
        if (!is_numeric($value)) {
            return (string) $value;
        }

        $value = (string) $value;

        $format = 'U';

        if (str_contains($value, '.')) {
            $fraction = explode('.', $value, 2)[1];

            $format = strlen($fraction) <= 3 ? 'U.v' : 'U.u';
        }

        $expires = DateTime::createFromFormat($format, $value);

        if ($expires === false) {
            return $value;
        }

        return $expires->format('l, d M Y H:i:s T');
    }
}
================================================
FILE: src/Loader/Http/Cookies/Date.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Cookies;
use DateTime;
use DateTimeInterface;
use InvalidArgumentException;
/**
 * Wraps a date string from a cookie's Expires attribute and parses it lazily.
 */
class Date
{
    /**
     * Parsed date, cached after the first successful dateTime() call.
     */
    protected ?DateTime $dateTime = null;

    public function __construct(protected readonly string $httpDateString) {}

    /**
     * Parses the date string, trying DateTimeInterface::COOKIE first and 'l, d M Y H:i:s T' second.
     *
     * @throws InvalidArgumentException when the string matches neither format.
     */
    public function dateTime(): DateTime
    {
        if ($this->dateTime instanceof DateTime) {
            return $this->dateTime;
        }

        $supportedFormats = [DateTimeInterface::COOKIE, 'l, d M Y H:i:s T'];

        foreach ($supportedFormats as $format) {
            $parsed = DateTime::createFromFormat($format, $this->httpDateString);

            if ($parsed instanceof DateTime) {
                $this->dateTime = $parsed;

                return $this->dateTime;
            }
        }

        throw new InvalidArgumentException('Can\'t parse date string ' . $this->httpDateString);
    }
}
================================================
FILE: src/Loader/Http/Cookies/Exceptions/InvalidCookieException.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Cookies\Exceptions;
use Exception;
/**
* Thrown by Cookie when the origin URL, the Set-Cookie string or one of its attributes is rejected.
*/
class InvalidCookieException extends Exception {}
================================================
FILE: src/Loader/Http/Exceptions/LoadingException.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Exceptions;
use Exception;
use Psr\Http\Message\UriInterface;
use Throwable;
/**
 * Exception for failed loading, optionally carrying the HTTP status code of the failed response.
 */
class LoadingException extends Exception
{
    public ?int $httpStatusCode = null;

    /**
     * Wraps any throwable; its class name and message become part of the message and it is set as previous.
     */
    public static function from(Throwable $previousException): self
    {
        $message = 'Loading failed. Exception of type ' . $previousException::class .
            ' was thrown. Exception message: ' . $previousException->getMessage();

        return new self($message, previous: $previousException);
    }

    /**
     * Creates an instance for a URI that failed to load, optionally with the response status code.
     */
    public static function make(string|UriInterface $uri, ?int $httpStatusCode = null): self
    {
        $uriString = $uri instanceof UriInterface ? (string) $uri : $uri;

        $hasStatusCode = $httpStatusCode !== null;

        $ending = $hasStatusCode ? ' (' . $httpStatusCode . ').' : '.';

        $instance = new self('Failed to load ' . $uriString . $ending);

        if ($hasStatusCode) {
            $instance->httpStatusCode = $httpStatusCode;
        }

        return $instance;
    }
}
================================================
FILE: src/Loader/Http/HeadlessBrowserLoaderHelper.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http;
use Closure;
use Crwlr\Crawler\Loader\Http\Browser\Screenshot;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\Cookies\Exceptions\InvalidCookieException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
use Exception;
use GuzzleHttp\Psr7\Response;
use HeadlessChromium\Browser;
use HeadlessChromium\BrowserFactory;
use HeadlessChromium\Communication\Message;
use HeadlessChromium\Exception\CommunicationException;
use HeadlessChromium\Exception\CommunicationException\CannotReadResponse;
use HeadlessChromium\Exception\CommunicationException\InvalidResponse;
use HeadlessChromium\Exception\CommunicationException\ResponseHasError;
use HeadlessChromium\Exception\JavascriptException;
use HeadlessChromium\Exception\NavigationExpired;
use HeadlessChromium\Exception\NoResponseAvailable;
use HeadlessChromium\Exception\OperationTimedOut;
use HeadlessChromium\Exception\TargetDestroyed;
use HeadlessChromium\Page;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;
use Throwable;
class HeadlessBrowserLoaderHelper
{
protected ?string $executable = null;
/**
* @var array<string, mixed>
*/
protected array $options = [
'windowSize' => [1920, 1000],
];
protected bool $optionsDirty = false;
protected ?Browser $browser = null;
protected ?Page $page = null;
protected ?string $proxy = null;
protected ?string $waitForEvent = null;
protected int $timeout = 30_000;
protected ?string $pageInitScript = null;
protected bool $useNativeUserAgent = false;
protected bool $includeShadowElements = false;
/**
* @var Closure[]
*/
protected array $tempPostNavigateHooks = [];
/**
* @param BrowserFactory|null $browserFactory Optional factory; where it is used is outside the visible part
*                                            of this class.
* @param LoggerInterface|null $logger Optional logger; it is passed to the post navigate hooks.
*/
public function __construct(
private ?BrowserFactory $browserFactory = null,
protected ?LoggerInterface $logger = null,
) {}
/**
* Set temporary post navigate hooks
*
* They will be executed after the next call to navigateToPageAndGetRespondedRequest()
* and forgotten afterward. Hooks set by an earlier call are replaced, not merged.
*
* @param Closure[] $hooks
*/
public function setTempPostNavigateHooks(array $hooks): static
{
$this->tempPostNavigateHooks = $hooks;
return $this;
}
/**
* Navigates the headless browser page to the request's URI and returns request and response
* (status code, headers, body) as RespondedRequest, including screenshots returned by post navigate hooks.
*
* @param RequestInterface $request Only its URI is used for navigating in this method.
* @param Throttler $throttler Is told about start and end of the navigation.
* @param string|null $proxy Passed on when a browser/page has to be (re)created.
* @param CookieJar|null $cookieJar When null, the browser's cookies are cleared before navigating.
*
* @throws OperationTimedOut
* @throws CommunicationException
* @throws NoResponseAvailable
* @throws NavigationExpired
* @throws InvalidResponse
* @throws CannotReadResponse
* @throws ResponseHasError
* @throws JavascriptException
* @throws Exception
*/
public function navigateToPageAndGetRespondedRequest(
RequestInterface $request,
Throttler $throttler,
?string $proxy = null,
?CookieJar $cookieJar = null,
): RespondedRequest {
// Re-use the open page when possible; create a new one when there is none, when shouldRenewBrowser()
// says so, or when the existing page turns out to be closed.
if (!$this->page || $this->shouldRenewBrowser($proxy)) {
$this->page = $this->getBrowser($request, $proxy)->createPage();
} else {
try {
$this->page->assertNotClosed();
} catch (TargetDestroyed) {
$this->page = $this->getBrowser($request, $proxy)->createPage();
}
}
if ($cookieJar === null) {
$this->page->getSession()->sendMessageSync(new Message('Network.clearBrowserCookies'));
}
// Defaults, in case the listener below is never called.
$statusCode = 200;
$responseHeaders = [];
$requestId = null;
// Registered before navigating, so the response data can be captured by reference.
// NOTE(review): once() presumably fires only for the first Network.responseReceived event - confirm
// against the chrome-php session API.
$this->page->getSession()->once(
"method:Network.responseReceived",
function ($params) use (&$statusCode, &$responseHeaders, &$requestId) {
$statusCode = $params['response']['status'];
$responseHeaders = $this->sanitizeResponseHeaders($params['response']['headers']);
$requestId = $params['requestId'] ?? null;
},
);
$throttler->trackRequestStartFor($request->getUri());
$this->navigate($request->getUri()->__toString());
$throttler->trackRequestEndFor($request->getUri());
$hookActionData = $this->callPostNavigateHooks();
// For responses that are not HTML documents, try to get the raw response body and fall back to the
// page's HTML when that yields null.
if (is_string($requestId) && $this->page && !$this->responseIsHtmlDocument($this->page)) {
$html = $this->tryToGetRawResponseBody($this->page, $requestId) ?? $this->getHtmlFromPage();
} else {
$html = $this->getHtmlFromPage();
}
$this->addCookiesToJar($cookieJar, $request->getUri());
return new RespondedRequest(
$request,
new Response($statusCode, $responseHeaders, $html),
$hookActionData['screenshots'] ?? [],
);
}
/**
* Returns the browser instance currently held by this helper, or null when there is none.
*/
public function getOpenBrowser(): ?Browser
{
return $this->browser;
}
/**
* Returns the page instance currently held by this helper, or null when there is none.
*/
public function getOpenPage(): ?Page
{
return $this->page;
}
/**
 * Closes the open page (if any) and the browser and forgets both. Does nothing when no browser is held.
 *
 * @throws Exception
 */
public function closeBrowser(): void
{
    if (!$this->browser) {
        return;
    }

    // The page is closed before the browser it belongs to.
    if ($this->page) {
        $this->page->close();

        $this->page = null;
    }

    $this->browser->close();

    $this->browser = null;
}
/**
* Stores the executable to use.
*
* NOTE(review): presumably the name/path of the chrome executable used when creating the browser -
* the code reading this property is outside the visible part of the class.
*/
public function setExecutable(string $executable): static
{
$this->executable = $executable;
return $this;
}
/**
* Replaces all options (including the default windowSize option) with the given ones and flags the
* options as changed.
*
* NOTE(review): the effect of the optionsDirty flag is not visible here; presumably it makes the helper
* create a new browser with the new options - confirm.
*
* @param array<string, mixed> $options
*/
public function setOptions(array $options): static
{
$this->options = $options;
$this->optionsDirty = true;
return $this;
}
/**
 * Adds the given options to the existing ones (same keys are overwritten) and flags the options as changed.
 *
 * @param array<string, mixed> $options
 */
public function addOptions(array $options): static
{
    $this->options = array_replace($this->options, $options);

    $this->optionsDirty = true;

    return $this;
}
/**
 * Sets the event name that navigate() passes to waitForNavigation() after navigating.
 */
public function waitForNavigationEvent(string $eventName): static
{
$this->waitForEvent = $eventName;
return $this;
}
/**
 * Returns the timeout value that navigate() passes to waitForNavigation().
 */
public function getTimeout(): int
{
return $this->timeout;
}
/**
 * Sets the timeout value that navigate() passes to waitForNavigation().
 * NOTE(review): the unit is presumably milliseconds, as expected by the headless chrome library - confirm.
 */
public function setTimeout(int $timeout): static
{
$this->timeout = $timeout;
return $this;
}
/**
 * Reduces every header value to its first line.
 *
 * Values are split on any line break ("\r\n", "\r" or "\n") instead of PHP_EOL. PHP_EOL depends on the platform
 * PHP runs on ("\r\n" on Windows), so values separated by a plain "\n" were not truncated there.
 *
 * @param string[] $headers header values by header name, possibly spanning multiple lines
 * @return string[] the same headers, each value cut off at its first line break
 */
public function sanitizeResponseHeaders(array $headers): array
{
    foreach ($headers as $key => $value) {
        // Normalize all line break styles to "\n" first, so a single explode() handles every variant.
        $normalized = str_replace(["\r\n", "\r"], "\n", $value);

        $headers[$key] = explode("\n", $normalized, 2)[0];
    }

    return $headers;
}
/**
 * Sets a script source that getBrowser() registers via setPagePreScript() on every browser it newly creates.
 *
 * @param string $scriptSource
 * @return $this
 */
public function setPageInitScript(string $scriptSource): static
{
$this->pageInitScript = $scriptSource;
return $this;
}
/**
 * Makes optionsFromRequest() drop the request's User-Agent header instead of passing it on as the browser's
 * userAgent option.
 */
public function useNativeUserAgent(): static
{
$this->useNativeUserAgent = true;
return $this;
}
/**
 * Makes getHtmlFromPage() build the HTML with a script that also walks shadow roots.
 */
public function includeShadowElementsInHtml(): static
{
$this->includeShadowElements = true;
return $this;
}
/**
 * Navigates the current page (if there is one) to the URL and waits for the navigation to finish, either for
 * the configured event or, when none is configured, with the library's default event.
 *
 * @throws OperationTimedOut
 * @throws CommunicationException
 * @throws NavigationExpired
 * @throws NoResponseAvailable
 * @throws InvalidResponse
 * @throws CannotReadResponse
 * @throws ResponseHasError
 */
protected function navigate(string $url): void
{
    $navigation = $this->page?->navigate($url);

    if (!$this->waitForEvent) {
        $navigation?->waitForNavigation(timeout: $this->timeout);

        return;
    }

    $navigation?->waitForNavigation($this->waitForEvent, $this->timeout);
}
/**
 * Runs all temporary post-navigate hooks (bound to this helper, receiving page and logger), collects the
 * Screenshot objects they return and clears the hook list afterwards.
 *
 * @return array<string, mixed> contains a 'screenshots' list only when at least one hook returned a Screenshot
 */
protected function callPostNavigateHooks(): array
{
    $screenshots = [];
    $hooks = empty($this->tempPostNavigateHooks) ? [] : $this->tempPostNavigateHooks;

    foreach ($hooks as $hook) {
        $hookResult = $hook->call($this, $this->page, $this->logger);

        if ($hookResult instanceof Screenshot) {
            $screenshots[] = $hookResult;
        }
    }

    $this->tempPostNavigateHooks = [];

    return empty($screenshots) ? [] : ['screenshots' => $screenshots];
}
/**
 * Copies the cookies of the current page into the cookie jar. Does nothing without a jar or without cookies.
 *
 * @throws CommunicationException
 * @throws OperationTimedOut
 * @throws NoResponseAvailable
 * @throws InvalidCookieException
 */
protected function addCookiesToJar(?CookieJar $cookieJar, UriInterface $requestUrl): void
{
    if (!$cookieJar) {
        return;
    }

    $pageCookies = $this->page?->getCookies();

    if (!$pageCookies) {
        return;
    }

    $cookieJar->addFrom($requestUrl, $pageCookies);
}
/**
 * Returns the current browser, or starts a fresh one when there is none yet or shouldRenewBrowser() demands it.
 *
 * A fresh start closes the previous browser, builds the options from the request, lazily creates the browser
 * factory, applies the page init script (if set) and clears the dirty flag of the options.
 *
 * @throws Exception
 */
protected function getBrowser(
    RequestInterface $request,
    ?string $proxy = null,
): Browser {
    if ($this->browser && !$this->shouldRenewBrowser($proxy)) {
        return $this->browser;
    }

    $this->closeBrowser();

    $browserOptions = $this->optionsFromRequest($request, $proxy);

    $this->browserFactory ??= new BrowserFactory($this->executable);

    $this->browser = $this->browserFactory->createBrowser($browserOptions);

    if ($this->pageInitScript) {
        $this->browser->setPagePreScript($this->pageInitScript);
    }

    $this->optionsDirty = false;

    return $this->browser;
}
/**
 * Tells whether a new browser is required: either the options were changed since the last browser was
 * created, or the given proxy differs from the one remembered by optionsFromRequest().
 */
protected function shouldRenewBrowser(?string $proxy): bool
{
return $this->optionsDirty || ($proxy !== $this->proxy);
}
/**
 * Builds the browser options for a request: the configured options plus user agent, request headers and proxy.
 * Also remembers the proxy (or null) on this helper.
 *
 * @param RequestInterface $request
 * @return array<string, mixed>
 */
protected function optionsFromRequest(RequestInterface $request, ?string $proxy = null): array
{
    $browserOptions = $this->options;
    $userAgentValues = $request->getHeader('User-Agent');

    if ($this->useNativeUserAgent) {
        // The browser's own user agent shall be used, so the header must not be sent along either.
        if (!empty($userAgentValues)) {
            $request = $request->withoutHeader('User-Agent');
        }
    } elseif (isset($userAgentValues[0])) {
        $browserOptions['userAgent'] = $userAgentValues[0];
    }

    $browserOptions['headers'] = array_merge(
        $browserOptions['headers'] ?? [],
        $this->prepareRequestHeaders($request->getHeaders()),
    );

    if (empty($proxy)) {
        $this->proxy = null;

        return $browserOptions;
    }

    $browserOptions['proxyServer'] = $proxy;
    $this->proxy = $proxy;

    return $browserOptions;
}
/**
 * Removes headers the headless browser can't handle and flattens multi-value headers to a single string,
 * joined with ';'.
 *
 * @param mixed[] $headers
 * @return array<string, string>
 */
protected function prepareRequestHeaders(array $headers = []): array
{
    return array_map(
        static fn($value) => is_array($value) ? implode(';', $value) : $value,
        $this->removeHeadersCausingErrorWithHeadlessBrowser($headers),
    );
}
/**
 * Drops the headers (currently only "host", matched case-insensitively) that must not be handed to the
 * headless browser. All other entries keep their keys.
 *
 * @param mixed[] $headers
 * @return mixed[]
 */
protected function removeHeadersCausingErrorWithHeadlessBrowser(array $headers = []): array
{
    $blockedNames = ['host'];

    return array_filter(
        $headers,
        static fn($name) => !in_array(strtolower($name), $blockedNames, true),
        ARRAY_FILTER_USE_KEY,
    );
}
/**
 * Asks the page (via JavaScript) whether the loaded document is an HTML document. Plain text documents that
 * start with an XML declaration are not treated as HTML.
 *
 * Returns false without a page and true when the evaluation fails for any reason.
 */
protected function responseIsHtmlDocument(?Page $page = null): bool
{
    if (!$page) {
        return false;
    }

    try {
        $evaluation = $page->evaluate(
            <<<JS
            (document.contentType === 'text/html' || document instanceof HTMLDocument) &&
            !(document.contentType === 'text/plain' && document.body.textContent.trimLeft().startsWith('<?xml '))
            JS,
        );

        return $evaluation->getReturnValue(3000);
    } catch (Throwable) {
        return true;
    }
}
/**
 * In production, retrieving the raw response body using the Network.getResponseBody message sometimes failed.
 * Waiting briefly before sending the message appeared to resolve the issue.
 * So, this method tries up to three times with a brief wait between each attempt.
 *
 * The protocol flags binary bodies with `base64Encoded`; such bodies are decoded, so callers always get the
 * raw bytes. There is no wait after the last attempt, because nothing follows that could benefit from it.
 *
 * @return string|null the raw response body, or null when it couldn't be retrieved
 */
protected function tryToGetRawResponseBody(Page $page, string $requestId): ?string
{
    $maxAttempts = 3;

    for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
        try {
            $message = $page->getSession()->sendMessageSync(new Message('Network.getResponseBody', [
                'requestId' => $requestId,
            ]));

            $result = $message->isSuccessful() ? ($message->getData()['result'] ?? []) : [];
            $body = $result['body'] ?? null;

            if (is_string($body) && $body !== '') {
                if (empty($result['base64Encoded'])) {
                    return $body;
                }

                $decoded = base64_decode($body, true);

                // Fall back to the body as received if it unexpectedly isn't valid base64.
                return $decoded === false ? $body : $decoded;
            }
        } catch (Throwable) {
            // Deliberately best effort: a failed attempt just leads to the next one.
        }

        if ($attempt < $maxAttempts) {
            usleep($attempt * 100000);
        }
    }

    return null;
}
/**
 * Returns the HTML of the current page, or an empty string when there is no page.
 *
 * When shadow elements shall be included, the HTML is assembled in the browser by a script that recursively
 * serializes the document and descends into shadow roots. If that script fails for any reason, the regular
 * page HTML is returned instead.
 *
 * @throws CommunicationException
 * @throws JavascriptException
 */
protected function getHtmlFromPage(): string
{
if ($this->page instanceof Page && $this->includeShadowElements) {
try {
// Found this script on
// https://stackoverflow.com/questions/69867758/how-can-i-get-all-the-html-in-a-document-or-node-containing-shadowroot-elements
return $this->page->evaluate(<<<JS
function extractHTML(node) {
if (!node) return ''
if (node.nodeType===3) return node.textContent;
if (node.nodeType!==1) return ''
let html = ''
let outer = node.cloneNode();
node = node.shadowRoot || node
if (node.children.length) {
for (let n of node.childNodes) {
if (n.assignedNodes) {
if (n.assignedNodes()[0]) {
html += extractHTML(n.assignedNodes()[0])
} else { html += n.innerHTML }
} else { html += extractHTML(n) }
}
} else { html = node.innerHTML }
outer.innerHTML = html
return outer.outerHTML
}
extractHTML(document.documentElement);
JS)->getReturnValue();
} catch (Throwable) {
// Best effort: fall back to the HTML without shadow elements.
return $this->page->getHtml();
}
}
return $this->page?->getHtml() ?? '';
}
}
================================================
FILE: src/Loader/Http/HttpLoader.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http;
use Crwlr\Crawler\Loader\Http\Cache\RetryManager;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\Http\Politeness\RetryErrorResponseHandler;
use Crwlr\Crawler\Loader\Http\Politeness\RobotsTxtHandler;
use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
use Crwlr\Crawler\Loader\Loader;
use Crwlr\Crawler\Steps\Filters\FilterInterface;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Crwlr\Crawler\Utils\RequestKey;
use Crwlr\Url\Exceptions\InvalidUrlException;
use Crwlr\Url\Url;
use Error;
use Exception;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use GuzzleHttp\Psr7\Request;
use HeadlessChromium\Exception\CommunicationException;
use HeadlessChromium\Exception\CommunicationException\CannotReadResponse;
use HeadlessChromium\Exception\CommunicationException\InvalidResponse;
use HeadlessChromium\Exception\CommunicationException\ResponseHasError;
use HeadlessChromium\Exception\JavascriptException;
use HeadlessChromium\Exception\NavigationExpired;
use HeadlessChromium\Exception\NoResponseAvailable;
use HeadlessChromium\Exception\OperationTimedOut;
use InvalidArgumentException;
use Psr\Http\Client\ClientExceptionInterface;
use Psr\Http\Client\ClientInterface;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;
use Throwable;
class HttpLoader extends Loader
{
protected ClientInterface $httpClient;
protected CookieJar $cookieJar;
protected bool $useCookies = true;
protected ?HeadlessBrowserLoaderHelper $browserHelper = null;
protected bool $useHeadlessBrowser = false;
protected ?RobotsTxtHandler $robotsTxtHandler = null;
protected Throttler $throttler;
/**
* @var mixed[]
*/
protected array $defaultGuzzleClientConfig = [
'connect_timeout' => 10,
'timeout' => 60,
];
protected int $maxRedirects = 10;
protected ?RetryManager $retryCachedErrorResponses = null;
protected bool $writeOnlyCache = false;
/**
* @var array<int, FilterInterface>
*/
protected array $cacheUrlFilters = [];
protected bool $skipCacheForNextRequest = false;
protected ?ProxyManager $proxies = null;
/**
 * Wires up the loader: logger for the retry handler, HTTP client (a Guzzle client built from the default
 * config merged with the given one, unless a client is injected), default success/error log hooks, cookie jar
 * and throttler.
 *
 * @param mixed[] $defaultGuzzleClientConfig
 */
public function __construct(
    UserAgentInterface $userAgent,
    ?ClientInterface $httpClient = null,
    ?LoggerInterface $logger = null,
    ?Throttler $throttler = null,
    protected RetryErrorResponseHandler $retryErrorResponseHandler = new RetryErrorResponseHandler(),
    array $defaultGuzzleClientConfig = [],
) {
    parent::__construct($userAgent, $logger);

    $this->cookieJar = new CookieJar();
    $this->throttler = $throttler ?? new Throttler();
    $this->retryErrorResponseHandler->setLogger($this->logger);

    if ($httpClient === null) {
        $httpClient = new Client($this->mergeClientConfigWithDefaults($defaultGuzzleClientConfig));
    }

    $this->httpClient = $httpClient;

    $this->onSuccess(function (RequestInterface $request, ResponseInterface $response, LoggerInterface $logger) {
        $logger->info('Loaded ' . $request->getUri()->__toString());
    });

    $this->onError(function (RequestInterface $request, Exception|Error|ResponseInterface $exceptionOrResponse, $logger) {
        $reason = $exceptionOrResponse instanceof ResponseInterface
            ? 'got response ' . $exceptionOrResponse->getStatusCode() . ' - ' . $exceptionOrResponse->getReasonPhrase()
            : $exceptionOrResponse->getMessage();

        $logger->error('Failed to load ' . $request->getUri()->__toString() . ': ' . $reason);
    });
}
/**
 * Loads the given URL or request and returns the responded request, or null when the subject is invalid,
 * loading is not allowed or loading failed. Failures are reported via the logger and the onError hook instead
 * of being thrown.
 *
 * @param mixed $subject
 * @return RespondedRequest|null
 */
public function load(mixed $subject): ?RespondedRequest
{
    $this->_resetCalledHooks();

    try {
        $request = $this->validateSubjectType($subject);
    } catch (Exception $invalidSubject) {
        $url = $subject instanceof RequestInterface ? (string) $subject->getUri() : (string) $subject;

        $this->logger->error('Invalid input URL: ' . $url . ' - ' . $invalidSubject->getMessage());

        return null;
    }

    try {
        if (!$this->isAllowedToBeLoaded($request->getUri())) {
            return null;
        }

        $isFromCache = false;
        $respondedRequest = $this->tryLoading($request, $isFromCache);

        // Responses with a status code of 400 and above count as errors.
        $hookName = $respondedRequest->response->getStatusCode() < 400 ? 'onSuccess' : 'onError';

        $this->callHook($hookName, $request, $respondedRequest->response);

        if (!$isFromCache) {
            $this->addToCache($respondedRequest);
        }

        return $respondedRequest;
    } catch (Throwable $exception) {
        // Don't move to finally so hooks don't run before it.
        $this->throttler->trackRequestEndFor($request->getUri());

        $this->callHook('onError', $request, $exception);

        return null;
    } finally {
        $this->callHook('afterLoad', $request);

        $this->_resetCalledHooks();
    }
}
/**
 * Loads the given URL or request and returns the responded request. Unlike load(), every failure (robots.txt
 * denial, error response with status 400 or above, any exception while loading) ends in an exception.
 *
 * @throws LoadingException|InvalidArgumentException|Exception
 */
public function loadOrFail(mixed $subject): RespondedRequest
{
    $this->_resetCalledHooks();

    $request = $this->validateSubjectType($subject);

    try {
        $this->isAllowedToBeLoaded($request->getUri(), true);

        $isFromCache = false;
        $respondedRequest = $this->tryLoading($request, $isFromCache);
        $statusCode = $respondedRequest->response->getStatusCode();

        if ($statusCode >= 400) {
            throw LoadingException::make($request->getUri(), $statusCode);
        }

        $this->callHook('onSuccess', $request, $respondedRequest->response);
        $this->callHook('afterLoad', $request);

        if (!$isFromCache) {
            $this->addToCache($respondedRequest);
        }

        return $respondedRequest;
    } catch (Throwable $exception) {
        $this->_resetCalledHooks();

        throw LoadingException::from($exception);
    }
}
/**
 * Turns cookie handling off: no cookies are added to requests and none are stored from responses.
 */
public function dontUseCookies(): static
{
$this->useCookies = false;
return $this;
}
/**
 * Flushes the loader's cookie jar.
 */
public function flushCookies(): void
{
$this->cookieJar->flush();
}
/**
 * Makes the loader load pages via the headless browser helper instead of the HTTP client.
 */
public function useHeadlessBrowser(): static
{
$this->useHeadlessBrowser = true;
return $this;
}
/**
 * Switches (back) to loading via the HTTP client and closes the headless browser.
 *
 * @throws Exception
 */
public function useHttpClient(): static
{
$this->useHeadlessBrowser = false;
$this->browser()->closeBrowser();
return $this;
}
/**
 * Tells whether the loader currently loads via the headless browser.
 */
public function usesHeadlessBrowser(): bool
{
return $this->useHeadlessBrowser;
}
/**
 * Sets the redirect limit checked by handleRedirects(), which throws as soon as its redirect counter reaches
 * this value.
 */
public function setMaxRedirects(int $maxRedirects): static
{
$this->maxRedirects = $maxRedirects;
return $this;
}
/**
 * Returns the robots.txt handler of this loader; it is created on first use.
 */
public function robotsTxt(): RobotsTxtHandler
{
    return $this->robotsTxtHandler ??= new RobotsTxtHandler($this, $this->logger);
}
/**
 * Returns the throttler used by this loader.
 */
public function throttle(): Throttler
{
return $this->throttler;
}
/**
 * Enables retrying of cached error responses and returns the (new) retry manager for further configuration.
 * Every call replaces a previously created manager.
 */
public function retryCachedErrorResponses(): RetryManager
{
    return $this->retryCachedErrorResponses = new RetryManager();
}
/**
 * Makes the cache write-only: getFromCache() then never returns cached responses.
 */
public function writeOnlyCache(): static
{
$this->writeOnlyCache = true;
return $this;
}
/**
 * Adds a filter that URLs must match for responses to be cached and for requests to be served from the cache.
 */
public function cacheOnlyWhereUrl(FilterInterface $filter): static
{
$this->cacheUrlFilters[] = $filter;
return $this;
}
/**
 * Routes requests through the given proxy.
 *
 * @throws Exception when neither the headless browser nor a guzzle HTTP client is used
 */
public function useProxy(string $proxyUrl): void
{
$this->checkIfProxiesCanBeUsed();
$this->proxies = new ProxyManager([$proxyUrl]);
}
/**
 * Routes requests through the given proxies, managed by a ProxyManager.
 *
 * @param string[] $proxyUrls
 * @throws Exception when neither the headless browser nor a guzzle HTTP client is used
 */
public function useRotatingProxies(array $proxyUrls): void
{
$this->checkIfProxiesCanBeUsed();
$this->proxies = new ProxyManager($proxyUrls);
}
/**
 * Returns the headless browser helper of this loader; it is created (with the loader's logger) on first use.
 */
public function browser(): HeadlessBrowserLoaderHelper
{
    return $this->browserHelper ??= new HeadlessBrowserLoaderHelper(logger: $this->logger);
}
/**
 * Stores the responded request in the cache, provided a cache is set and the cache URL filters allow it.
 *
 * @throws \Psr\SimpleCache\InvalidArgumentException
 */
public function addToCache(RespondedRequest $respondedRequest): void
{
    if (!$this->cache || !$this->shouldResponseBeCached($respondedRequest)) {
        return;
    }

    $this->cache->set($respondedRequest->cacheKey(), $respondedRequest);
}
/**
 * Makes the next request bypass the cache lookup. tryLoading() resets the flag after that lookup.
 */
public function skipCacheForNextRequest(): static
{
$this->skipCacheForNextRequest = true;
return $this;
}
/**
 * Prepares the request, fires the beforeLoad hook and then either serves the response from the cache (setting
 * $isFromCache and firing onCacheHit) or actually loads it.
 *
 * @throws LoadingException|Throwable|\Psr\SimpleCache\InvalidArgumentException
 */
protected function tryLoading(
    RequestInterface $request,
    bool &$isFromCache,
): RespondedRequest {
    $request = $this->prepareRequest($request);

    $this->callHook('beforeLoad', $request);

    $cached = null;

    if ($this->shouldRequestBeServedFromCache($request)) {
        $cached = $this->getFromCache($request);
    }

    if ($cached) {
        $isFromCache = true;

        $cached->setIsServedFromCache();

        $this->callHook('onCacheHit', $request, $cached->response);
    }

    // Skipping the cache is a one-time thing, so reset it after the lookup above.
    $this->skipCacheForNextRequest = false;

    return $cached ?: $this->waitForGoAndLoad($request);
}
/**
 * Waits until the throttler allows the request, loads it and, when the retry handler says the response is one
 * to wait and retry for, hands over to the retry handler.
 *
 * @throws ClientExceptionInterface
 * @throws GuzzleException
 * @throws LoadingException
 * @throws CommunicationException
 * @throws CannotReadResponse
 * @throws InvalidResponse
 * @throws ResponseHasError
 * @throws JavascriptException
 * @throws NavigationExpired
 * @throws NoResponseAvailable
 * @throws OperationTimedOut
 * @throws Exception
 */
protected function waitForGoAndLoad(RequestInterface $request): RespondedRequest
{
    $this->throttler->waitForGo($request->getUri());

    $respondedRequest = $this->loadViaClientOrHeadlessBrowser($request);

    if (!$this->retryErrorResponseHandler->shouldWait($respondedRequest)) {
        return $respondedRequest;
    }

    return $this->retryErrorResponseHandler->handleRetries(
        $respondedRequest,
        // Every retry prepares the request anew before loading it again.
        fn() => $this->loadViaClientOrHeadlessBrowser($this->prepareRequest($request)),
    );
}
/**
 * Loads the request with the headless browser (passing throttler, proxy and, if cookies are enabled, the
 * cookie jar) or with the HTTP client, depending on the loader's configuration.
 *
 * @throws ClientExceptionInterface
 * @throws GuzzleException
 * @throws LoadingException
 * @throws CommunicationException
 * @throws CannotReadResponse
 * @throws InvalidResponse
 * @throws ResponseHasError
 * @throws JavascriptException
 * @throws NavigationExpired
 * @throws NoResponseAvailable
 * @throws OperationTimedOut
 */
protected function loadViaClientOrHeadlessBrowser(RequestInterface $request): RespondedRequest
{
    if (!$this->useHeadlessBrowser) {
        return $this->handleRedirects($request);
    }

    $proxy = $this->proxies?->getProxy();
    $cookieJar = $this->useCookies ? $this->cookieJar : null;

    return $this->browser()->navigateToPageAndGetRespondedRequest($request, $this->throttler, $proxy, $cookieJar);
}
/**
 * Sends the request with the HTTP client and follows redirects itself (recursively), so all redirect URLs and
 * cookies along the way are tracked. Throws once the redirect counter reaches the max redirects setting.
 *
 * The throttler is told about the request start only for the initial request and about the request end only
 * for the final (non-redirect) response.
 *
 * @throws ClientExceptionInterface
 * @throws LoadingException
 * @throws GuzzleException
 * @throws Exception
 */
protected function handleRedirects(
    RequestInterface $request,
    ?RespondedRequest $respondedRequest = null,
    int $redirectNumber = 0,
): RespondedRequest {
    if ($redirectNumber >= $this->maxRedirects) {
        throw new LoadingException('Too many redirects.');
    }

    $isInitialRequest = !$respondedRequest;

    if ($isInitialRequest) {
        $this->throttler->trackRequestStartFor($request->getUri());
    }

    $response = $this->proxies && $this->httpClient instanceof Client
        ? $this->sendProxiedRequestUsingGuzzle($request, $this->httpClient)
        : $this->httpClient->sendRequest($request);

    if ($isInitialRequest) {
        $respondedRequest = new RespondedRequest($request, $response);
    } else {
        $respondedRequest->setResponse($response);
    }

    $this->addCookiesToJar($respondedRequest);

    if (!$respondedRequest->isRedirect()) {
        $this->throttler->trackRequestEndFor($respondedRequest->request->getUri());

        return $respondedRequest;
    }

    $this->logger()->info('Load redirect to: ' . $respondedRequest->effectiveUri());

    $redirectRequest = $request->withUri(Url::parsePsr7($respondedRequest->effectiveUri()));

    return $this->handleRedirects($redirectRequest, $respondedRequest, $redirectNumber + 1);
}
/**
 * Sends the request through the current proxy, using guzzle's request() method (the PSR-18 sendRequest()
 * method has no way to pass a proxy).
 *
 * Guzzle's request() by default throws on 4xx/5xx responses and follows redirects on its own, whereas its
 * sendRequest() does neither. Both behaviors are switched off here, so proxied requests behave like
 * non-proxied ones: error responses are returned as responses, and redirects are left to the loader's own
 * redirect handling.
 *
 * @throws GuzzleException
 */
protected function sendProxiedRequestUsingGuzzle(RequestInterface $request, Client $client): ResponseInterface
{
    return $client->request(
        $request->getMethod(),
        $request->getUri(),
        [
            'headers' => $request->getHeaders(),
            'proxy' => $this->proxies?->getProxy(),
            'version' => $request->getProtocolVersion(),
            'body' => $request->getBody(),
            'allow_redirects' => false,
            'http_errors' => false,
        ],
    );
}
/**
 * Makes sure proxies are supported with the current setup, which is the case when the headless browser or a
 * guzzle HTTP client is used.
 *
 * @return void
 * @throws Exception
 */
protected function checkIfProxiesCanBeUsed(): void
{
    if ($this->usesHeadlessBrowser() || $this->httpClient instanceof Client) {
        return;
    }

    throw new Exception(
        'The included proxy feature can only be used when using a guzzle HTTP client or headless chrome ' .
        'browser for loading.',
    );
}
/**
 * Returns the default guzzle client config with the given config laid on top (given values win on equal keys).
 *
 * @param mixed[] $config
 * @return mixed[]
 */
protected function mergeClientConfigWithDefaults(array $config): array
{
    return array_replace($this->defaultGuzzleClientConfig, $config);
}
/**
 * Checks the URI against robots.txt. A denial is logged as a warning and, depending on $throwsException,
 * either thrown as LoadingException or signaled by returning false.
 *
 * @throws LoadingException
 * @throws Exception
 */
protected function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException = false): bool
{
    if ($this->robotsTxt()->isAllowed($uri)) {
        return true;
    }

    $denialMessage = 'Crawler is not allowed to load ' . $uri . ' according to robots.txt file.';

    $this->logger->warning($denialMessage);

    if (!$throwsException) {
        return false;
    }

    throw new LoadingException($denialMessage);
}
/**
 * Looks up the response for a request in the cache.
 *
 * Returns null when there is no cache, the cache is write-only, nothing is cached for the request, the cached
 * entry is unusable, or the cached response is an error response that shall be retried.
 *
 * @throws \Psr\SimpleCache\InvalidArgumentException
 * @throws Exception
 */
protected function getFromCache(RequestInterface $request): ?RespondedRequest
{
    if (!$this->cache || $this->writeOnlyCache) {
        return null;
    }

    $key = RequestKey::from($request);

    if (!$this->cache->has($key)) {
        return null;
    }

    $this->logger->info('Found ' . $request->getUri()->__toString() . ' in cache.');

    $respondedRequest = $this->cache->get($key);

    // Previously, until v0.7 just used serialized arrays. Leave this for backwards compatibility.
    if (is_array($respondedRequest)) {
        $respondedRequest = RespondedRequest::fromArray($respondedRequest);
    }

    // The entry can vanish between has() and get(), or hold something that isn't a responded request. Treat
    // that as a cache miss instead of failing on a property access on a non-object.
    if (!$respondedRequest instanceof RespondedRequest) {
        $this->logger->warning('Ignoring unusable cache entry for ' . $request->getUri()->__toString() . '.');

        return null;
    }

    if ($this->retryCachedErrorResponses?->shallBeRetried($respondedRequest->response->getStatusCode())) {
        $this->logger->info('Cached response was an error response, retry.');

        return null;
    }

    return $respondedRequest;
}
/**
 * A response is cached when every cache URL filter matches at least one of the URLs involved in the request
 * (the requested URL and all redirect URLs). Without filters, everything is cached.
 */
protected function shouldResponseBeCached(RespondedRequest $respondedRequest): bool
{
    foreach ($this->cacheUrlFilters as $filter) {
        $anyUrlMatched = false;

        foreach ($respondedRequest->allUris() as $url) {
            if ($filter->evaluate($url)) {
                $anyUrlMatched = true;
            }
        }

        if (!$anyUrlMatched) {
            return false;
        }
    }

    return true;
}
/**
 * A request may be served from the cache unless the cache shall be skipped for it, or one of the cache URL
 * filters doesn't match the request URL.
 */
protected function shouldRequestBeServedFromCache(RequestInterface $request): bool
{
    if ($this->skipCacheForNextRequest) {
        return false;
    }

    foreach ($this->cacheUrlFilters as $filter) {
        $matches = $filter->evaluate((string) $request->getUri());

        if (!$matches) {
            return false;
        }
    }

    return true;
}
/**
 * Turns the subject into a request: a URL string becomes a GET request, a request object is passed through.
 * Relative references and invalid URLs are rejected.
 *
 * @throws InvalidArgumentException|Exception
 */
protected function validateSubjectType(RequestInterface|string $requestOrUri): RequestInterface
{
    if (!is_string($requestOrUri)) {
        $schemeIsMissing = empty(trim($requestOrUri->getUri()->getScheme()));

        if ($schemeIsMissing && Url::parse($requestOrUri->getUri())->isRelativeReference()) {
            throw new InvalidArgumentException('The URI is a relative reference and therefore can\'t be loaded.');
        }

        return $requestOrUri;
    }

    try {
        $parsedUrl = Url::parse($requestOrUri);

        if (!$parsedUrl->isRelativeReference()) {
            return new Request('GET', $parsedUrl->toPsr7());
        }

        throw new InvalidArgumentException(
            'The URI is a relative reference and therefore can\'t be loaded.',
        );
    } catch (InvalidUrlException) {
        throw new InvalidArgumentException('Invalid URL.');
    }
}
/**
 * Prepares a request for sending: sets the loader's user agent, flattens all header values to a single line
 * each and adds the matching cookies from the cookie jar.
 *
 * @throws Exception
 */
protected function prepareRequest(RequestInterface $request): RequestInterface
{
    $prepared = $request->withHeader('User-Agent', $this->userAgent->__toString());

    // When writing tests I found that guzzle somehow messed up headers with multiple strings as value in the PSR-7
    // request object. It sent only the last part of the array, instead of concatenating the array of strings to a
    // comma separated string. Don't know if that happens with all handlers (curl, stream), will investigate
    // further. But until this is fixed, we just prepare the headers ourselves.
    foreach (array_keys($prepared->getHeaders()) as $headerName) {
        $prepared = $prepared->withHeader($headerName, $prepared->getHeaderLine($headerName));
    }

    return $this->addCookiesToRequest($prepared);
}
/**
 * Stores the cookies of the response in the cookie jar (if cookies are enabled). Problems are only logged as
 * a warning, they never interrupt loading.
 */
protected function addCookiesToJar(RespondedRequest $respondedRequest): void
{
    if (!$this->useCookies) {
        return;
    }

    try {
        $this->cookieJar->addFrom($respondedRequest->effectiveUri(), $respondedRequest->response);
    } catch (Exception $cookieProblem) {
        $this->logger->warning('Problem when adding cookies to the Jar: ' . $cookieProblem->getMessage());
    }
}
/**
 * Adds one Cookie header value per cookie that the jar holds for the request URL (if cookies are enabled).
 *
 * @throws Exception
 */
protected function addCookiesToRequest(RequestInterface $request): RequestInterface
{
    if ($this->useCookies) {
        $cookies = $this->cookieJar->getFor($request->getUri());

        foreach ($cookies as $cookie) {
            $request = $request->withAddedHeader('Cookie', $cookie->__toString());
        }
    }

    return $request;
}
}
================================================
FILE: src/Loader/Http/Messages/RespondedRequest.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Messages;
use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Browser\Screenshot;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Utils\RequestKey;
use Crwlr\Url\Url;
use Exception;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\ResponseInterface;
class RespondedRequest
{
/**
* @var string[]
*/
protected array $redirects = [];
protected bool $isServedFromCache = false;
/**
 * Creates the request/response pair. The response is passed through setResponse(), so a redirect response
 * immediately registers its redirect target.
 *
 * @param Screenshot[] $screenshots
 * @throws Exception
 */
public function __construct(
public RequestInterface $request,
public ResponseInterface $response,
public array $screenshots = [],
) {
$this->setResponse($this->response);
}
/**
 * Rebuilds an instance from the array format produced by __serialize(). An effective URI that differs from
 * the request URI is registered as redirect.
 *
 * @param mixed[] $data
 * @return RespondedRequest
 * @throws Exception
 */
public static function fromArray(array $data): RespondedRequest
{
    $request = self::requestFromArray($data);
    $response = self::responseFromArray($data);
    $screenshots = self::screenshotsFromArray($data);

    $instance = new RespondedRequest($request, $response, $screenshots);

    $wasRedirected = $data['effectiveUri'] && $data['effectiveUri'] !== $data['requestUri'];

    if ($wasRedirected) {
        $instance->addRedirectUri($data['effectiveUri']);
    }

    return $instance;
}
/**
 * Flattens request, response, effective URI and screenshot paths into a plain array. Bodies are included as
 * strings and screenshots are reduced to their file paths.
 *
 * @return mixed[]
 * @throws MissingZlibExtensionException
 */
public function __serialize(): array
{
return [
'requestMethod' => $this->request->getMethod(),
'requestUri' => $this->request->getUri()->__toString(),
'requestHeaders' => $this->request->getHeaders(),
'requestBody' => Http::getBodyString($this->request),
'effectiveUri' => $this->effectiveUri(),
'responseStatusCode' => $this->response->getStatusCode(),
'responseHeaders' => $this->response->getHeaders(),
'responseBody' => Http::getBodyString($this->response),
'screenshots' => array_map(fn(Screenshot $screenshot) => $screenshot->path, $this->screenshots),
];
}
/**
 * Returns the serialized array, extended by the shorthand keys url, uri, status, headers and body, which
 * mirror effectiveUri, responseStatusCode, responseHeaders and responseBody.
 *
 * @return mixed[]
 * @throws MissingZlibExtensionException
 */
public function toArrayForResult(): array
{
    $result = $this->__serialize();

    $result['url'] = $result['effectiveUri'];
    $result['uri'] = $result['effectiveUri'];
    $result['status'] = $result['responseStatusCode'];
    $result['headers'] = $result['responseHeaders'];
    $result['body'] = $result['responseBody'];

    return $result;
}
/**
 * Restores the instance from the array format produced by __serialize(). An effective URI that differs from
 * the request URI is registered as redirect.
 *
 * @param mixed[] $data
 * @throws Exception
 */
public function __unserialize(array $data): void
{
    $this->request = self::requestFromArray($data);
    $this->response = self::responseFromArray($data);
    $this->screenshots = self::screenshotsFromArray($data);

    $effectiveUri = $data['effectiveUri'];

    if ($effectiveUri && $effectiveUri !== $data['requestUri']) {
        $this->addRedirectUri($effectiveUri);
    }
}
/**
 * Returns the last redirect URI or, when there was no redirect, the requested URI.
 */
public function effectiveUri(): string
{
    if (empty($this->redirects)) {
        return $this->requestedUri();
    }

    return end($this->redirects);
}
/**
 * Returns the URI of the original request as string.
 */
public function requestedUri(): string
{
    // Convert explicitly instead of relying on the implicit string conversion of the return type.
    return (string) $this->request->getUri();
}
/**
 * Returns the requested URI followed by all redirect URIs, without duplicates, in order of first occurrence.
 *
 * @return array<int, string>
 */
public function allUris(): array
{
    $uris = [$this->requestedUri(), ...$this->redirects];

    return array_values(array_unique($uris));
}
/**
 * Tells whether the current response has a 3xx status code.
 */
public function isRedirect(): bool
{
    $statusCode = $this->response->getStatusCode();

    return $statusCode >= 300 && $statusCode < 400;
}
/**
 * Returns the redirect URIs registered so far, in the order they were added.
 *
 * @return string[]
 */
public function redirects(): array
{
return $this->redirects;
}
/**
 * Replaces the response. When the new response is a redirect, its target (taken from the Location header by
 * addRedirectUri()) is registered as redirect URI.
 *
 * @throws Exception
 */
public function setResponse(ResponseInterface $response): void
{
$this->response = $response;
if ($this->isRedirect()) {
$this->addRedirectUri();
}
}
/**
 * Registers a redirect URI: the given one or, by default, the Location header of the current response. It is
 * resolved against the current effective URI first, and skipped when equal to the last registered redirect.
 *
 * @throws Exception
 */
public function addRedirectUri(?string $redirectUri = null): void
{
    $baseUrl = Url::parse($this->effectiveUri());
    $target = $redirectUri ?? $this->response->getHeaderLine('Location');
    $resolvedUri = $baseUrl->resolve($target)->__toString();

    // Add it only if different from the previous one.
    if ($resolvedUri === end($this->redirects)) {
        return;
    }

    $this->redirects[] = $resolvedUri;
}
/**
 * Returns the cache key for this instance, derived from its request via RequestKey.
 */
public function cacheKey(): string
{
return RequestKey::from($this->request);
}
/**
 * Tells whether this instance was flagged as served from the cache.
 */
public function isServedFromCache(): bool
{
return $this->isServedFromCache;
}
/**
 * Flags this instance as served from the cache (or not, when passing false).
 */
public function setIsServedFromCache(bool $value = true): void
{
$this->isServedFromCache = $value;
}
/**
 * Builds the request from the requestMethod, requestUri, requestHeaders and requestBody entries.
 *
 * @param mixed[] $data
 */
protected static function requestFromArray(array $data): Request
{
return new Request(
$data['requestMethod'],
$data['requestUri'],
$data['requestHeaders'],
$data['requestBody'],
);
}
/**
 * Builds the response from the responseStatusCode, responseHeaders and responseBody entries.
 *
 * @param mixed[] $data
 */
protected static function responseFromArray(array $data): Response
{
return new Response(
$data['responseStatusCode'],
$data['responseHeaders'],
$data['responseBody'],
);
}
/**
 * Builds Screenshot objects for all paths in the screenshots entry whose file (still) exists. Returns an
 * empty list when there is no screenshots entry.
 *
 * @param mixed[] $data
 * @return Screenshot[]
 */
protected static function screenshotsFromArray(array $data): array
{
    if (!array_key_exists('screenshots', $data)) {
        return [];
    }

    $existing = [];

    foreach ($data['screenshots'] as $path) {
        if (!file_exists($path)) {
            continue;
        }

        $existing[] = new Screenshot($path);
    }

    return $existing;
}
}
================================================
FILE: src/Loader/Http/Politeness/RetryErrorResponseHandler.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Politeness;
use Closure;
use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Psr\Http\Message\ResponseInterface;
use Psr\Log\LoggerInterface;
/**
 * Handles error responses that indicate the crawler should slow down (429 and 503): waits and retries the
 * request a configurable number of times, honoring a numeric Retry-After header for the first wait.
 */
class RetryErrorResponseHandler
{
    protected ?LoggerInterface $logger = null;

    /**
     * Status codes to wait and retry for, with their reason phrases for log messages.
     *
     * @var array<int, string>
     */
    protected array $waitErrors = [
        429 => 'Too many Requests',
        503 => 'Service Unavailable',
    ];

    /**
     * @param int $retries how many times a request is retried
     * @param int[] $wait seconds to wait before each retry, indexed by retry number (starting at 0)
     * @param int $maxWait the maximum number of seconds a Retry-After header may demand
     */
    public function __construct(
        protected int $retries = 2,
        protected array $wait = [10, 60],
        protected int $maxWait = 60,
    ) {}

    /**
     * Tells whether the response has one of the status codes to wait and retry for.
     */
    public function shouldWait(RespondedRequest $respondedRequest): bool
    {
        return array_key_exists($respondedRequest->response->getStatusCode(), $this->waitErrors);
    }

    public function setLogger(LoggerInterface $logger): void
    {
        $this->logger = $logger;
    }

    /**
     * Waits and retries (by calling $retryCallback) until a response comes back that doesn't require waiting, or
     * the retries are used up.
     *
     * A Retry-After header of the initial error response replaces the first wait time, but only for this call:
     * the configured wait times are restored afterwards, so one response can't affect later requests.
     *
     * @param Closure $retryCallback re-sends the request and returns the new RespondedRequest
     * @return RespondedRequest the first response that doesn't require waiting
     * @throws LoadingException when all retries ended in error responses or Retry-After exceeds the max wait
     */
    public function handleRetries(
        RespondedRequest $respondedRequest,
        Closure $retryCallback,
    ): RespondedRequest {
        $this->logReceivedErrorResponseMessage($respondedRequest);

        $configuredWait = $this->wait;

        try {
            $this->wait = $this->waitTimesFor($respondedRequest->response);

            for ($retry = 0; $retry < $this->retries; $retry++) {
                $this->logWaitForRetryMessage($retry);

                sleep($this->wait[$retry]);

                $respondedRequest = $retryCallback();

                if (!$respondedRequest instanceof RespondedRequest) {
                    continue;
                }

                if (!$this->shouldWait($respondedRequest)) {
                    return $respondedRequest;
                }

                $this->logRepeatedErrorMessage($respondedRequest);
            }
        } finally {
            $this->wait = $configuredWait;
        }

        $this->logger?->error('Stop crawling');

        throw new LoadingException('Stopped crawling because of repeated error responses.');
    }

    /**
     * Builds the wait times for one handleRetries() run: the configured ones, with the first replaced by the
     * response's Retry-After value (if any). Retry numbers without a configured wait time reuse the closest
     * previous one, so there is a defined, non-negative wait for every retry.
     *
     * @return int[]
     * @throws LoadingException
     */
    private function waitTimesFor(ResponseInterface $response): array
    {
        $waitTimes = $this->wait;

        $retryAfter = $this->getWaitTimeFromResponse($response);

        if ($retryAfter !== null) {
            $waitTimes[0] = $retryAfter;
        }

        $fallback = 0;

        for ($retry = 0; $retry < $this->retries; $retry++) {
            if (isset($waitTimes[$retry])) {
                $fallback = max(0, (int) $waitTimes[$retry]);
            }

            $waitTimes[$retry] = $fallback;
        }

        return $waitTimes;
    }

    /**
     * Reads the number of seconds to wait from a numeric Retry-After header. Returns null when the header is
     * missing or not numeric. Negative values are treated as zero.
     *
     * @throws LoadingException when the header demands to wait longer than the max wait time
     */
    protected function getWaitTimeFromResponse(ResponseInterface $response): ?int
    {
        $retryAfterHeader = $response->getHeader('Retry-After');

        if (empty($retryAfterHeader)) {
            return null;
        }

        $retryAfterValue = reset($retryAfterHeader);

        if (!is_numeric($retryAfterValue)) {
            return null;
        }

        $waitFor = max(0, (int) $retryAfterValue);

        if ($waitFor > $this->maxWait) {
            $this->retryAfterExceedsLimitMessage($response);
        }

        return $waitFor;
    }

    /**
     * Returns status code and reason phrase for log messages, or '?' for status codes not in the wait errors.
     */
    protected function getResponseCodeAndReasonPhrase(RespondedRequest|ResponseInterface $respondedRequest): string
    {
        $response = $respondedRequest instanceof RespondedRequest ? $respondedRequest->response : $respondedRequest;

        $statusCode = $response->getStatusCode();

        if (array_key_exists($statusCode, $this->waitErrors)) {
            return $statusCode . ' (' . $this->waitErrors[$statusCode] . ')';
        }

        return '?';
    }

    protected function logReceivedErrorResponseMessage(RespondedRequest $respondedRequest): void
    {
        $statusCodeAndReasonPhrase = $this->getResponseCodeAndReasonPhrase($respondedRequest);

        $this->logger?->warning(
            'Request to ' . $respondedRequest->requestedUri() . ' returned ' . $statusCodeAndReasonPhrase,
        );
    }

    protected function logWaitForRetryMessage(int $retryNumber): void
    {
        $this->logger?->warning('Will wait for ' . $this->wait[$retryNumber] . ' seconds and then retry');
    }

    protected function logRepeatedErrorMessage(RespondedRequest $respondedRequest): void
    {
        $statusCodeAndReasonPhrase = $this->getResponseCodeAndReasonPhrase($respondedRequest);

        $this->logger?->warning('Retry again received an error response: ' . $statusCodeAndReasonPhrase);
    }

    /**
     * Logs that the Retry-After header exceeds the max wait time and aborts by throwing. Never returns.
     *
     * @throws LoadingException
     */
    protected function retryAfterExceedsLimitMessage(ResponseInterface $response): string
    {
        $statusCodeAndReasonPhrase = $this->getResponseCodeAndReasonPhrase($response);

        $message = 'Retry-After header in ' . $statusCodeAndReasonPhrase . ' response, requires to wait longer ' .
            'than the defined max wait time for this case. If you want to increase this limit, set it ' .
            'in the ErrorResponseHandler of your HttpLoader instance.';

        $this->logger?->error($message);

        throw new LoadingException($message);
    }
}
================================================
FILE: src/Loader/Http/Politeness/RobotsTxtHandler.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Politeness;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Loader;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Crwlr\RobotsTxt\Exceptions\InvalidRobotsTxtFileException;
use Crwlr\RobotsTxt\RobotsTxt;
use Crwlr\Url\Url;
use Exception;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;
/**
 * Loads, parses and remembers robots.txt files per URL root and answers whether the loader's
 * user agent is allowed to load a certain URL.
 */
class RobotsTxtHandler
{
    protected UserAgentInterface $userAgent;

    /**
     * Parsed robots.txt files, keyed by the root of the URLs they belong to.
     *
     * @var array<string, RobotsTxt>
     */
    protected array $robotsTxts = [];

    protected bool $ignoreWildcardRules = false;

    public function __construct(
        protected Loader $loader,
        protected ?LoggerInterface $logger = null,
    ) {
        $this->userAgent = $this->loader->userAgent();
    }

    /**
     * After calling this, isAllowed() only disallows URLs that are explicitly not allowed for the
     * user agent's product token (see isAllowed()).
     */
    public function ignoreWildcardRules(): void
    {
        $this->ignoreWildcardRules = true;
    }

    /**
     * Tells whether the given URL may be loaded.
     *
     * Always true when the user agent is not a BotUserAgent, and always true for the path /robots.txt.
     *
     * @throws Exception
     */
    public function isAllowed(string|UriInterface|Url $url): bool
    {
        if (!$this->userAgent instanceof BotUserAgent) {
            return true;
        }

        $url = $this->getUrlInstance($url);

        if ($url->path() === '/robots.txt') {
            return true;
        }

        $robotsTxt = $this->getRobotsTxtFor($url);

        if ($this->ignoreWildcardRules) {
            return !$robotsTxt->isExplicitlyNotAllowedFor($url, $this->userAgent->productToken());
        }

        return $robotsTxt->isAllowed($url, $this->userAgent->productToken());
    }

    /**
     * Returns the sitemaps listed in the robots.txt file belonging to the given URL.
     *
     * @return string[]
     * @throws InvalidRobotsTxtFileException
     */
    public function getSitemaps(string|UriInterface|Url $url): array
    {
        return $this->getRobotsTxtFor($url)->sitemaps();
    }

    /**
     * Returns the parsed robots.txt for the root of the given URL, loading it on first use.
     *
     * When parsing the loaded content fails, a warning is logged and an empty robots.txt is used instead.
     *
     * @throws InvalidRobotsTxtFileException|Exception
     */
    protected function getRobotsTxtFor(string|UriInterface|Url $url): RobotsTxt
    {
        $url = $this->getUrlInstance($url);

        $root = $url->root();

        if (isset($this->robotsTxts[$root])) {
            return $this->robotsTxts[$root];
        }

        $robotsTxtContent = $this->loadRobotsTxtContent($root . '/robots.txt');

        try {
            $this->robotsTxts[$root] = RobotsTxt::parse($robotsTxtContent);
        } catch (Exception $exception) {
            $this->logger?->warning('Failed to parse robots.txt: ' . $exception->getMessage());

            $this->robotsTxts[$root] = RobotsTxt::parse('');
        }

        return $this->robotsTxts[$root];
    }

    /**
     * Loads the robots.txt file from the given URL and returns its body, or an empty string when
     * the loader returns a falsy response.
     */
    protected function loadRobotsTxtContent(string $robotsTxtUrl): string
    {
        $usedHeadlessBrowser = false;

        if ($this->loader instanceof HttpLoader) {
            // If loader is set to use headless browser, temporary switch to using PSR-18 HTTP Client.
            $usedHeadlessBrowser = $this->loader->usesHeadlessBrowser();

            $this->loader->useHttpClient();
        }

        try {
            $response = $this->loader->load($robotsTxtUrl);
        } finally {
            // Switch back even when load() throws, so the loader isn't left on the HTTP client
            // for all following requests.
            if ($this->loader instanceof HttpLoader && $usedHeadlessBrowser) {
                $this->loader->useHeadlessBrowser();
            }
        }

        return $response ? Http::getBodyString($response) : '';
    }

    /**
     * Converts strings and PSR-7 URIs to Url instances; Url instances are returned unchanged.
     */
    protected function getUrlInstance(string|UriInterface|Url $url): Url
    {
        if (is_string($url) || $url instanceof UriInterface) {
            return Url::parse($url);
        }

        return $url;
    }
}
================================================
FILE: src/Loader/Http/Politeness/Throttler.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Politeness;
use Crwlr\Crawler\Loader\Http\Politeness\TimingUnits\MultipleOf;
use Crwlr\Url\Url;
use Crwlr\Utils\Microseconds;
use Exception;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
/**
 * Tracks when requests to a domain were started and answered, and sleeps before the next request
 * to that domain, based on how long the latest response took.
 */
class Throttler
{
    /**
     * Start times of currently running requests, keyed by domain.
     *
     * @var array<string, Microseconds>
     */
    protected array $latestRequestTimes = [];

    /**
     * Times when the latest response was received, keyed by domain.
     *
     * @var array<string, Microseconds>
     */
    protected array $latestResponseTimes = [];

    /**
     * Durations of the latest request/response cycle, keyed by domain.
     *
     * @var array<string, Microseconds>
     */
    protected array $latestDurations = [];

    protected Microseconds|MultipleOf $from;

    protected Microseconds|MultipleOf $to;

    protected Microseconds $min;

    /**
     * URLs for which a request start was tracked, but no request end yet.
     *
     * @var string[]
     */
    private array $_currentRequestUrls = [];

    /**
     * Defaults: wait between 1.0 and 2.0 times the latest response duration, at least 0.25 seconds.
     *
     * @throws InvalidArgumentException
     */
    public function __construct(
        Microseconds|MultipleOf|null $from = null,
        Microseconds|MultipleOf|null $to = null,
        ?Microseconds $min = null,
        protected ?Microseconds $max = null,
    ) {
        if ($from === null) {
            $from = new MultipleOf(1.0);
        }

        if ($to === null) {
            $to = new MultipleOf(2.0);
        }

        $this->from = $from;
        $this->to = $to;

        $this->validateFromAndTo();

        $this->min = $min ?? Microseconds::fromSeconds(0.25);
    }

    /**
     * Sets the range from which the wait time is randomly picked.
     *
     * @throws InvalidArgumentException
     */
    public function waitBetween(Microseconds|MultipleOf $from, Microseconds|MultipleOf $to): static
    {
        $this->from = $from;
        $this->to = $to;

        $this->validateFromAndTo();

        return $this;
    }

    /**
     * Sets the lower bound for the wait time.
     */
    public function waitAtLeast(Microseconds $seconds): static
    {
        $this->min = $seconds;

        return $this;
    }

    /**
     * Sets the upper bound for the wait time.
     */
    public function waitAtMax(Microseconds $seconds): static
    {
        $this->max = $seconds;

        return $this;
    }

    /**
     * Remembers the current time as the start of a request to the URL's domain.
     *
     * @throws Exception
     */
    public function trackRequestStartFor(UriInterface $url): void
    {
        $domain = $this->getDomain($url);

        $this->latestRequestTimes[$domain] = $this->time();

        $this->_internalTrackStartFor($url);
    }

    /**
     * Records response time and duration for the URL's domain.
     *
     * Does nothing when no request start was tracked for exactly this URL, or when there is
     * no tracked request start time for its domain.
     *
     * @throws Exception
     */
    public function trackRequestEndFor(UriInterface $url): void
    {
        if (!$this->_requestToUrlWasStarted($url)) {
            return;
        }

        $domain = $this->getDomain($url);

        if (!isset($this->latestRequestTimes[$domain])) {
            return;
        }

        $responseTime = $this->time();

        $this->latestResponseTimes[$domain] = $responseTime;

        $this->latestDurations[$domain] = $responseTime->subtract($this->latestRequestTimes[$domain]);

        unset($this->latestRequestTimes[$domain]);

        $this->_internalTrackEndFor($url);
    }

    /**
     * Sleeps until the calculated wait time after the latest response from the URL's domain has passed.
     *
     * Returns immediately when no response duration was tracked for the domain yet.
     *
     * @throws Exception
     */
    public function waitForGo(UriInterface $url): void
    {
        $domain = $this->getDomain($url);

        if (!isset($this->latestDurations[$domain])) {
            return;
        }

        $waitUntil = $this->calcWaitUntil($this->latestDurations[$domain], $this->latestResponseTimes[$domain]);

        $now = $this->time();

        if (!$now->isGreaterThanOrEqual($waitUntil)) {
            usleep($waitUntil->subtract($now)->value);
        }
    }

    /**
     * The current time as Microseconds, built from microtime(true).
     */
    protected function time(): Microseconds
    {
        return Microseconds::fromSeconds(microtime(true));
    }

    /**
     * Determines the key under which timings for the URL are tracked: the URL's domain,
     * falling back to its host, falling back to '*'.
     *
     * @throws Exception
     */
    protected function getDomain(UriInterface $url): string
    {
        $domain = Url::parse($url)->domain() ?: $url->getHost();

        return is_string($domain) ? $domain : '*';
    }

    /**
     * Calculates the point in time until which the next request to a domain has to wait.
     *
     * A random value between from and to is picked (MultipleOf values are calculated from the
     * latest response duration), then bounded by min and max, and added to the latest response time.
     */
    protected function calcWaitUntil(
        Microseconds $latestResponseDuration,
        Microseconds $latestResponseTime,
    ): Microseconds {
        $from = $this->from;

        if ($from instanceof MultipleOf) {
            $from = $from->calc($latestResponseDuration);
        }

        $to = $this->to;

        if ($to instanceof MultipleOf) {
            $to = $to->calc($latestResponseDuration);
        }

        $waitValue = $this->getRandBetween($from, $to);

        if ($this->min->isGreaterThan($waitValue)) {
            $waitValue = $this->min;
        }

        if ($this->max !== null && $this->max->isLessThan($waitValue)) {
            $waitValue = $this->max;
        }

        return $latestResponseTime->add($waitValue);
    }

    /**
     * Picks a random value between the two given values (both inclusive, as rand() is).
     */
    protected function getRandBetween(Microseconds $from, Microseconds $to): Microseconds
    {
        return $from->equals($to) ? $from : new Microseconds(rand($from->value, $to->value));
    }

    /**
     * @internal
     */
    protected function _internalTrackStartFor(UriInterface $url): void
    {
        $key = (string) $url;

        $this->_currentRequestUrls[$key] = $key;
    }

    /**
     * @internal
     */
    protected function _internalTrackEndFor(UriInterface $url): void
    {
        $key = (string) $url;

        unset($this->_currentRequestUrls[$key]);
    }

    /**
     * Whether a request start was tracked for exactly this URL and not ended yet.
     */
    protected function _requestToUrlWasStarted(UriInterface $url): bool
    {
        return array_key_exists((string) $url, $this->_currentRequestUrls);
    }

    /**
     * @throws InvalidArgumentException When from and to are of different types, or from is greater than to.
     */
    protected function validateFromAndTo(): void
    {
        if (!$this->fromAndToAreOfSameType()) {
            throw new InvalidArgumentException('From and to values must be of the same type (Seconds or MultipleOf).');
        }

        if ($this->fromIsGreaterThanTo()) {
            throw new InvalidArgumentException('From value can\'t be greater than to value.');
        }
    }

    protected function fromAndToAreOfSameType(): bool
    {
        if ($this->from instanceof Microseconds) {
            return $this->to instanceof Microseconds;
        }

        return $this->from instanceof MultipleOf && $this->to instanceof MultipleOf;
    }

    /**
     * Compares from and to when both are of the same type; false for mixed types.
     */
    protected function fromIsGreaterThanTo(): bool
    {
        $from = $this->from;

        $to = $this->to;

        if ($from instanceof Microseconds && $to instanceof Microseconds) {
            return $from->isGreaterThan($to);
        }

        if ($from instanceof MultipleOf && $to instanceof MultipleOf) {
            return $from->factorIsGreaterThan($to);
        }

        return false;
    }
}
================================================
FILE: src/Loader/Http/Politeness/TimingUnits/MultipleOf.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http\Politeness\TimingUnits;
use Crwlr\Utils\Microseconds;
/**
 * A timing unit expressed as a factor of some other duration (e.g. 1.5 times the response time).
 */
class MultipleOf
{
    public function __construct(public readonly float $factor) {}

    /**
     * Multiplies the given duration with the factor, using the factor rounded to two decimals.
     *
     * @param Microseconds $microseconds The duration to multiply.
     * @return Microseconds The result, rounded to whole microseconds.
     */
    public function calc(Microseconds $microseconds): Microseconds
    {
        // The scaled factor has to be rounded before casting: e.g. round(1.15, 2) * 100 is
        // 114.99999999999999 as a float, and a bare (int) cast would truncate that to 114.
        $factorTwoDecimalsAsInt = (int) round(round($this->factor, 2) * 100);

        $result = (int) round(($microseconds->value * $factorTwoDecimalsAsInt) / 100);

        return new Microseconds($result);
    }

    /**
     * Whether this instance's factor is greater than the factor of the given instance.
     */
    public function factorIsGreaterThan(MultipleOf $multipleOf): bool
    {
        return $this->factor > $multipleOf->factor;
    }
}
================================================
FILE: src/Loader/Http/ProxyManager.php
================================================
<?php
namespace Crwlr\Crawler\Loader\Http;
/**
 * Holds a list of proxies and hands them out one after another (round-robin).
 */
class ProxyManager
{
    /**
     * Index (in $proxies) of the proxy returned by the latest getProxy() call when rotating.
     */
    protected ?int $lastUsedProxy = null;

    /**
     * @param string[] $proxies The proxies; keys are discarded, the list is re-indexed from 0.
     */
    public function __construct(protected array $proxies)
    {
        $this->proxies = array_values($this->proxies);
    }

    /**
     * Same as hasOnlySingleProxy().
     */
    public function singleProxy(): bool
    {
        return $this->hasOnlySingleProxy();
    }

    public function hasOnlySingleProxy(): bool
    {
        return count($this->proxies) === 1;
    }

    public function hasMultipleProxies(): bool
    {
        return count($this->proxies) > 1;
    }

    /**
     * Returns the proxy to use next.
     *
     * With a single proxy it's always that one. With multiple proxies each call returns the next
     * one in the list, starting over at the first one after the last one was returned.
     *
     * @throws \LogicException When the manager was created with an empty list of proxies.
     */
    public function getProxy(): string
    {
        if ($this->proxies === []) {
            // Without this check, reading the undefined offset 0 below ends in a TypeError.
            throw new \LogicException('Can\'t get a proxy, because the list of proxies is empty.');
        }

        if ($this->hasOnlySingleProxy()) {
            return $this->proxies[0];
        }

        if ($this->lastUsedProxy === null || !isset($this->proxies[$this->lastUsedProxy + 1])) {
            $this->lastUsedProxy = 0;
        } else {
            $this->lastUsedProxy += 1;
        }

        return $this->proxies[$this->lastUsedProxy];
    }
}
================================================
FILE: src/Loader/Loader.php
================================================
<?php
namespace Crwlr\Crawler\Loader;
use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;
use Psr\SimpleCache\CacheInterface;
/**
 * Base class for loaders: holds user agent, logger and cache, and manages hook callbacks
 * that are called at certain points of a load call.
 */
abstract class Loader implements LoaderInterface
{
    protected LoggerInterface $logger;

    protected ?CacheInterface $cache = null;

    /**
     * Registered callbacks per hook name.
     *
     * @var array<string, callable[]>
     */
    protected array $hooks = [
        'beforeLoad' => [],
        'onCacheHit' => [],
        'onSuccess' => [],
        'onError' => [],
        'afterLoad' => [],
    ];

    /**
     * Names of the hooks that were called since the last reset (see _resetCalledHooks()).
     *
     * @var array<string, bool>
     */
    private array $_hooksCalledInCurrentLoadCall = [];

    /**
     * @param LoggerInterface|null $logger When null, a CliLogger is used.
     */
    public function __construct(
        protected UserAgentInterface $userAgent,
        ?LoggerInterface $logger = null,
    ) {
        if ($logger === null) {
            $logger = new CliLogger();
        }

        $this->logger = $logger;
    }

    public function beforeLoad(callable $callback): void
    {
        $this->addHookCallback(__FUNCTION__, $callback);
    }

    public function onCacheHit(callable $callback): void
    {
        $this->addHookCallback(__FUNCTION__, $callback);
    }

    public function onSuccess(callable $callback): void
    {
        $this->addHookCallback(__FUNCTION__, $callback);
    }

    public function onError(callable $callback): void
    {
        $this->addHookCallback(__FUNCTION__, $callback);
    }

    public function afterLoad(callable $callback): void
    {
        $this->addHookCallback(__FUNCTION__, $callback);
    }

    public function setCache(CacheInterface $cache): static
    {
        $this->cache = $cache;

        return $this;
    }

    public function userAgent(): UserAgentInterface
    {
        return $this->userAgent;
    }

    /**
     * Can be implemented in a child class to check if it is allowed to load a certain uri (e.g. check robots.txt)
     * Throw a LoadingException when it's not allowed and $throwsException is set to true.
     */
    protected function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException = false): bool
    {
        return true;
    }

    /**
     * Calls all callbacks registered for the given hook, passing the given arguments plus the
     * logger as last argument.
     *
     * Unknown hook names are ignored. A warning is logged when the hook was already called since
     * the last reset. The afterLoad callbacks are skipped (with a warning) when callbacks are
     * registered for it but beforeLoad was not called before.
     */
    protected function callHook(string $hook, mixed ...$arguments): void
    {
        if (!array_key_exists($hook, $this->hooks)) {
            return;
        }

        $alreadyCalled = $this->_hooksCalledInCurrentLoadCall;

        if (array_key_exists($hook, $alreadyCalled)) {
            $this->logger->warning(
                $hook . ' was already called in this load call. Probably a problem in the loader implementation.',
            );
        }

        $isAfterLoadWithoutBeforeLoad = $hook === 'afterLoad' &&
            !empty($this->hooks[$hook]) &&
            !array_key_exists('beforeLoad', $alreadyCalled);

        if ($isAfterLoadWithoutBeforeLoad) {
            $this->logger->warning(
                'The afterLoad hook was called without a preceding call to the beforeLoad hook. Therefore don\'t ' .
                'run the hook callbacks. Most likely an exception/error occurred before the beforeLoad hook call.',
            );

            return;
        }

        $arguments[] = $this->logger;

        foreach ($this->hooks[$hook] as $callback) {
            call_user_func($callback, ...$arguments);
        }

        $this->_hooksCalledInCurrentLoadCall[$hook] = true;
    }

    protected function logger(): LoggerInterface
    {
        return $this->logger;
    }

    /**
     * Appends the callback to the list of callbacks for the given hook name.
     */
    protected function addHookCallback(string $hook, callable $callback): void
    {
        $this->hooks[$hook][] = $callback;
    }

    /**
     * Forgets which hooks were already called.
     *
     * @internal
     * @return void
     */
    protected function _resetCalledHooks(): void
    {
        $this->_hooksCalledInCurrentLoadCall = [];
    }
}
================================================
FILE: src/Loader/LoaderInterface.php
================================================
<?php
namespace Crwlr\Crawler\Loader;
use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use InvalidArgumentException;
use Psr\SimpleCache\CacheInterface;
/**
* Contract for loaders: load a subject (with or without throwing on failure) and accept a PSR-16 cache.
*/
interface LoaderInterface
{
/**
* Load the given subject.
*
* NOTE(review): the behavior on failure isn't defined here; presumably implementations return a falsy value
* instead of throwing (loadOrFail() is the throwing variant) - confirm against the implementations.
*
* @param mixed $subject The subject to load, whatever the Loader implementation needs to load something.
* @return mixed
*/
public function load(mixed $subject): mixed;
/**
* Load the given subject and throw when it fails.
*
* @param mixed $subject The subject to load, whatever the Loader implementation needs to load something.
* @return mixed
* @throws InvalidArgumentException Throw an InvalidArgumentException when the type of $subject argument isn't
* valid for the Loader implementation.
* @throws LoadingException Throw one when loading failed.
*/
public function loadOrFail(mixed $subject): mixed;
/**
* Add an implementation of the PSR-16 CacheInterface that the Loader will use to cache loaded resources.
*
* @return static
*/
public function setCache(CacheInterface $cache): static;
}
================================================
FILE: src/Logger/CliLogger.php
================================================
<?php
namespace Crwlr\Crawler\Logger;
use DateTime;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use Stringable;
use UnexpectedValueException;
/**
 * PSR-3 logger that echoes every message as one line, prefixed with the current time and the
 * log level, colored with ANSI escape sequences.
 */
class CliLogger implements LoggerInterface
{
    public function emergency(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function alert(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function critical(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function error(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function warning(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function notice(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function info(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function debug(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    /**
     * Prints time, level and message. The context is not used.
     *
     * @param mixed $level
     * @param mixed[] $context
     * @throws InvalidArgumentException When $level is not a string.
     * @throws UnexpectedValueException When $level is not one of the eight known level names.
     */
    public function log($level, string|Stringable $message, array $context = []): void
    {
        if (!is_string($level)) {
            throw new InvalidArgumentException('Level must be string.');
        }

        $knownLevels = ['emergency', 'alert', 'critical', 'error', 'warning', 'notice', 'info', 'debug'];

        if (!in_array($level, $knownLevels, true)) {
            throw new UnexpectedValueException('Unknown log level.');
        }

        $this->printTimeAndLevel($level);

        echo $message . "\n";
    }

    /**
     * Prints the line prefix: the time, followed by the upper-cased level in square brackets,
     * wrapped in the level's color sequence and a reset sequence.
     */
    protected function printTimeAndLevel(string $level): void
    {
        echo sprintf(
            "%s \033[0;%sm[%s]\033[0m ",
            $this->time(),
            $this->levelColor($level),
            strtoupper($level),
        );
    }

    /**
     * The current time in the format hours:minutes:seconds:microseconds.
     */
    protected function time(): string
    {
        $now = new DateTime();

        return $now->format('H:i:s:u');
    }

    /**
     * Maps a log level name to the ANSI color code used for it.
     */
    protected function levelColor(string $level): string
    {
        $brightRed = '91';

        $colorCodes = [
            'emergency' => $brightRed,
            'alert' => $brightRed,
            'critical' => $brightRed,
            'error' => '31', // red
            'warning' => '36', // cyan
            'notice' => '34', // blue
            'info' => '32', // green
            'debug' => '33', // yellow
        ];

        return $colorCodes[$level];
    }
}
================================================
FILE: src/Logger/PreStepInvocationLogger.php
================================================
<?php
namespace Crwlr\Crawler\Logger;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use Stringable;
use UnexpectedValueException;
/**
 * PSR-3 logger that only collects messages in memory, so they can be passed on to
 * another logger later.
 */
class PreStepInvocationLogger implements LoggerInterface
{
    /**
     * The collected messages, each with the keys 'level' and 'message'.
     *
     * @var array<int, array<string, string>>
     */
    public array $messages = [];

    public function emergency(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function alert(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function critical(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function error(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function warning(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function notice(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function info(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    public function debug(string|Stringable $message, array $context = []): void
    {
        $this->log(__FUNCTION__, $message, $context);
    }

    /**
     * Stores level and message. The context is not stored.
     *
     * @param mixed $level
     * @param mixed[] $context
     * @throws InvalidArgumentException When $level is not a string.
     * @throws UnexpectedValueException When $level is not one of the eight known level names.
     */
    public function log($level, string|Stringable $message, array $context = []): void
    {
        if (!is_string($level)) {
            throw new InvalidArgumentException('Level must be string.');
        }

        $knownLevels = ['emergency', 'alert', 'critical', 'error', 'warning', 'notice', 'info', 'debug'];

        if (!in_array($level, $knownLevels, true)) {
            throw new UnexpectedValueException('Unknown log level.');
        }

        $this->messages[] = ['level' => $level, 'message' => $message];
    }

    /**
     * Replays all collected messages, in the order they were logged, by calling the
     * method named like the message's level on the given logger.
     */
    public function passToOtherLogger(LoggerInterface $logger): void
    {
        foreach ($this->messages as $entry) {
            $levelMethod = $entry['level'];

            $logger->{$levelMethod}($entry['message']);
        }
    }
}
================================================
FILE: src/Output.php
================================================
<?php
namespace Crwlr\Crawler;
/**
* Output is an Io without any members of its own; everything it can do is defined in Io.
*/
class Output extends Io {}
================================================
FILE: src/Result.php
================================================
<?php
namespace Crwlr\Crawler;
use Crwlr\Crawler\Utils\OutputTypeHelper;
/**
 * Collects result data as key => value pairs. Setting the same key repeatedly turns
 * its value into a list of all the values that were set.
 */
final class Result
{
    /**
     * @var mixed[]
     */
    private array $data = [];

    /**
     * @param Result|null $result When given, the new instance starts with a copy of its data.
     */
    public function __construct(protected ?Result $result = null)
    {
        if ($result) {
            $this->data = $result->data;
        }
    }

    /**
     * Sets a value, or adds it to the values already set for the key.
     *
     * - An empty key is replaced with a generated one ('unnamed1', 'unnamed2', ...).
     * - When the key already holds a non-array value or an associative array, the old and the
     *   new value are combined to a list of the two.
     * - When the key already holds a list, the new value is appended to it.
     */
    public function set(string $key, mixed $value): self
    {
        if ($key === '') {
            $key = $this->getUnnamedKey();
        }

        if (array_key_exists($key, $this->data)) {
            if (!is_array($this->data[$key]) || $this->isAssociativeArray($this->data[$key])) {
                $this->data[$key] = [$this->data[$key], $value];
            } else {
                $this->data[$key][] = $value;
            }
        } else {
            $this->data[$key] = $value;
        }

        return $this;
    }

    public function has(string $key): bool
    {
        return array_key_exists($key, $this->data);
    }

    /**
     * Returns the value for the key, or $default when the key doesn't exist.
     */
    public function get(string $key, mixed $default = null): mixed
    {
        if ($this->has($key)) {
            return $this->data[$key];
        }

        return $default;
    }

    /**
     * Returns the data as array, with child objects converted by OutputTypeHelper.
     *
     * When the only element has a key containing 'unnamed' and its value is an associative array,
     * that array is returned directly instead of being wrapped in the unnamed key.
     *
     * @return mixed[]
     */
    public function toArray(): array
    {
        $data = OutputTypeHelper::recursiveChildObjectsToArray($this->data);

        if (count($data) === 1) {
            $onlyKey = array_key_first($data);

            // str_contains() takes the haystack first: the key is searched for 'unnamed', not the
            // other way round. The cast is needed because numeric string keys become int array keys.
            if (
                str_contains((string) $onlyKey, 'unnamed') &&
                is_array($data[$onlyKey]) &&
                OutputTypeHelper::isAssociativeArray($data[$onlyKey])
            ) {
                return $data[$onlyKey];
            }
        }

        return $data;
    }

    /**
     * Finds the first key 'unnamed<N>' (N counting up from 1) for which get() returns null.
     */
    private function getUnnamedKey(): string
    {
        $i = 1;

        while ($this->get('unnamed' . $i) !== null) {
            $i++;
        }

        return 'unnamed' . $i;
    }

    /**
     * Decides by looking at the first key only: true when it is a string. False for empty arrays.
     *
     * @param mixed[] $array
     */
    private function isAssociativeArray(array $array): bool
    {
        foreach ($array as $key => $value) {
            return is_string($key);
        }

        return false;
    }
}
================================================
FILE: src/Steps/BaseStep.php
================================================
<?php
namespace Crwlr\Crawler\Steps;
use Adbar\Dot;
use Closure;
use Crwlr\Crawler\Crawler;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Io;
use Crwlr\Crawler\Logger\PreStepInvocationLogger;
use Crwlr\Crawler\Output;
use Crwlr\Crawler\Result;
use Crwlr\Crawler\Steps\Exceptions\PreRunValidationException;
use Crwlr\Crawler\Steps\Filters\Filterable;
use Crwlr\Crawler\Steps\Refiners\RefinerInterface;
use Crwlr\Crawler\Utils\OutputTypeHelper;
use Generator;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
/**
* Base class for classes Step and Group which share some things in terms of adding output data to Result objects.
*/
abstract class BaseStep implements StepInterface
{
use Filterable;
/**
* true means: keep the whole output array/object
* string: keep that one key from the (array/object) output
* array: keep those keys from the (array/object) output
*
* @var bool|string|string[]
*/
protected bool|string|array $keep = false;
/**
* Same as $keep, but for input data.
*
* @var bool|string|string[]
*/
protected bool|string|array $keepFromInput = false;
protected ?string $keepAs = null;
protected ?string $keepInputAs = null;
protected ?Crawler $parentCrawler = null;
/**
* @var array<string, Closure>
*/
protected array $subCrawlers = [];
protected ?LoggerInterface $logger = null;
protected ?string $useInputKey = null;
protected bool|string $uniqueInput = false;
/**
* @var array<int|string, true>
*/
protected array $uniqueInputKeys = [];
protected bool|string $uniqueOutput = false;
/**
* @var array<int|string, true>
*/
protected array $uniqueOutputKeys = [];
/**
* @var array<Closure|RefinerInterface|array{ key: string, refiner: Closure|RefinerInterface}>
*/
protected array $refiners = [];
protected ?string $outputKey = null;
protected ?int $maxOutputs = null;
protected int $currentOutputCount = 0;
private ?Input $fullOriginalInput = null;
/**
* @param Input $input
* @return Generator<Output>
*/
abstract public function invokeStep(Input $input): Generator;
/**
 * Sets the logger and also hands it to all refiners that implement RefinerInterface.
 *
 * When the current logger is a PreStepInvocationLogger, the messages it collected are
 * passed to the new logger first.
 */
public function addLogger(LoggerInterface $logger): static
{
    if ($this->logger instanceof PreStepInvocationLogger) {
        $this->logger->passToOtherLogger($logger);
    }

    $this->logger = $logger;

    foreach ($this->refiners as $entry) {
        // Entries are either the refiner itself, or an array holding it under the key 'refiner'.
        $refiner = is_array($entry) ? $entry['refiner'] : $entry;

        if ($refiner instanceof RefinerInterface) {
            $refiner->addLogger($logger);
        }
    }

    return $this;
}
public function setParentCrawler(Crawler $crawler): static
{
$this->parentCrawler = $crawler;
return $this;
}
/**
 * Defines what to keep from the output: the given key(s), or the whole output when called
 * without keys (or with null).
 *
 * @param string|string[]|null $keys
 */
public function keep(string|array|null $keys = null): static
{
    $this->keep = $keys ?? true;

    return $this;
}
public function keepAs(string $key): static
{
$this->keepAs = $key;
return $this;
}
/**
 * Same as keep(), but for input data: keep the given key(s), or the whole input when called
 * without keys (or with null).
 *
 * @param string|string[]|null $keys
 */
public function keepFromInput(string|array|null $keys = null): static
{
    $this->keepFromInput = $keys ?? true;

    return $this;
}
public function keepInputAs(string $key): static
{
$this->keepInputAs = $key;
return $this;
}
public function keepsAnything(): bool
{
return $this->keepsAnythingFromOutputData() || $this->keepsAnythingFromInputData();
}
public function keepsAnythingFromInputData(): bool
{
return $this->keepFromInput !== false || $this->keepInputAs !== null;
}
public function keepsAnythingFromOutputData(): bool
{
return $this->keep !== false || $this->keepAs !== null;
}
public function useInputKey(string $key): static
{
gitextract_d22hbn5_/
├── .editorconfig
├── .gitattributes
├── .github/
│ └── workflows/
│ └── ci.yml
├── .gitignore
├── .php-cs-fixer.php
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── bin/
│ └── add-git-hooks
├── composer.json
├── git-hooks/
│ └── pre-commit
├── phpstan.neon
├── phpunit.xml
├── src/
│ ├── Cache/
│ │ ├── CacheItem.php
│ │ ├── Exceptions/
│ │ │ ├── MissingZlibExtensionException.php
│ │ │ └── ReadingCacheFailedException.php
│ │ └── FileCache.php
│ ├── Crawler.php
│ ├── HttpCrawler/
│ │ └── AnonymousHttpCrawlerBuilder.php
│ ├── HttpCrawler.php
│ ├── Input.php
│ ├── Io.php
│ ├── Loader/
│ │ ├── Http/
│ │ │ ├── Browser/
│ │ │ │ ├── Screenshot.php
│ │ │ │ └── ScreenshotConfig.php
│ │ │ ├── Cache/
│ │ │ │ └── RetryManager.php
│ │ │ ├── Cookies/
│ │ │ │ ├── Cookie.php
│ │ │ │ ├── CookieJar.php
│ │ │ │ ├── Date.php
│ │ │ │ └── Exceptions/
│ │ │ │ └── InvalidCookieException.php
│ │ │ ├── Exceptions/
│ │ │ │ └── LoadingException.php
│ │ │ ├── HeadlessBrowserLoaderHelper.php
│ │ │ ├── HttpLoader.php
│ │ │ ├── Messages/
│ │ │ │ └── RespondedRequest.php
│ │ │ ├── Politeness/
│ │ │ │ ├── RetryErrorResponseHandler.php
│ │ │ │ ├── RobotsTxtHandler.php
│ │ │ │ ├── Throttler.php
│ │ │ │ └── TimingUnits/
│ │ │ │ └── MultipleOf.php
│ │ │ └── ProxyManager.php
│ │ ├── Loader.php
│ │ └── LoaderInterface.php
│ ├── Logger/
│ │ ├── CliLogger.php
│ │ └── PreStepInvocationLogger.php
│ ├── Output.php
│ ├── Result.php
│ ├── Steps/
│ │ ├── BaseStep.php
│ │ ├── Csv.php
│ │ ├── Dom/
│ │ │ ├── DomDocument.php
│ │ │ ├── HtmlDocument.php
│ │ │ ├── HtmlElement.php
│ │ │ ├── Node.php
│ │ │ ├── NodeList.php
│ │ │ ├── XmlDocument.php
│ │ │ └── XmlElement.php
│ │ ├── Dom.php
│ │ ├── Exceptions/
│ │ │ └── PreRunValidationException.php
│ │ ├── Filters/
│ │ │ ├── AbstractFilter.php
│ │ │ ├── ArrayFilter.php
│ │ │ ├── ClosureFilter.php
│ │ │ ├── ComparisonFilter.php
│ │ │ ├── Enums/
│ │ │ │ ├── ComparisonFilterRule.php
│ │ │ │ ├── StringFilterRule.php
│ │ │ │ ├── StringLengthFilterRule.php
│ │ │ │ └── UrlFilterRule.php
│ │ │ ├── Filter.php
│ │ │ ├── FilterInterface.php
│ │ │ ├── Filterable.php
│ │ │ ├── NegatedFilter.php
│ │ │ ├── StringFilter.php
│ │ │ ├── StringLengthFilter.php
│ │ │ └── UrlFilter.php
│ │ ├── Group.php
│ │ ├── Html/
│ │ │ ├── CssSelector.php
│ │ │ ├── DomQuery.php
│ │ │ ├── Exceptions/
│ │ │ │ └── InvalidDomQueryException.php
│ │ │ ├── GetLink.php
│ │ │ ├── GetLinks.php
│ │ │ ├── MetaData.php
│ │ │ ├── SchemaOrg.php
│ │ │ ├── SelectorTarget.php
│ │ │ └── XPathQuery.php
│ │ ├── Html.php
│ │ ├── Json.php
│ │ ├── Loading/
│ │ │ ├── GetSitemapsFromRobotsTxt.php
│ │ │ ├── Http/
│ │ │ │ ├── AbstractPaginator.php
│ │ │ │ ├── Browser/
│ │ │ │ │ └── BrowserAction.php
│ │ │ │ ├── Document.php
│ │ │ │ ├── Paginate.php
│ │ │ │ ├── Paginator.php
│ │ │ │ └── Paginators/
│ │ │ │ ├── QueryParams/
│ │ │ │ │ ├── AbstractQueryParamManipulator.php
│ │ │ │ │ ├── Decrementor.php
│ │ │ │ │ ├── Incrementor.php
│ │ │ │ │ └── QueryParamManipulator.php
│ │ │ │ ├── QueryParamsPaginator.php
│ │ │ │ ├── SimpleWebsitePaginator.php
│ │ │ │ └── StopRules/
│ │ │ │ ├── Contains.php
│ │ │ │ ├── IsEmptyInDom.php
│ │ │ │ ├── IsEmptyInHtml.php
│ │ │ │ ├── IsEmptyInJson.php
│ │ │ │ ├── IsEmptyInXml.php
│ │ │ │ ├── IsEmptyResponse.php
│ │ │ │ ├── NotContains.php
│ │ │ │ ├── PaginatorStopRules.php
│ │ │ │ └── StopRule.php
│ │ │ ├── Http.php
│ │ │ ├── HttpBase.php
│ │ │ ├── HttpCrawl.php
│ │ │ └── LoadingStep.php
│ │ ├── Refiners/
│ │ │ ├── AbstractRefiner.php
│ │ │ ├── DateTime/
│ │ │ │ └── DateTimeFormat.php
│ │ │ ├── DateTimeRefiner.php
│ │ │ ├── Html/
│ │ │ │ └── RemoveFromHtml.php
│ │ │ ├── HtmlRefiner.php
│ │ │ ├── RefinerInterface.php
│ │ │ ├── String/
│ │ │ │ ├── AbstractStringRefiner.php
│ │ │ │ ├── StrAfterFirst.php
│ │ │ │ ├── StrAfterLast.php
│ │ │ │ ├── StrBeforeFirst.php
│ │ │ │ ├── StrBeforeLast.php
│ │ │ │ ├── StrBetweenFirst.php
│ │ │ │ ├── StrBetweenLast.php
│ │ │ │ └── StrReplace.php
│ │ │ ├── StringRefiner.php
│ │ │ ├── Url/
│ │ │ │ ├── AbstractUrlRefiner.php
│ │ │ │ ├── WithFragment.php
│ │ │ │ ├── WithHost.php
│ │ │ │ ├── WithPath.php
│ │ │ │ ├── WithPort.php
│ │ │ │ ├── WithQuery.php
│ │ │ │ ├── WithScheme.php
│ │ │ │ └── WithoutPort.php
│ │ │ └── UrlRefiner.php
│ │ ├── Sitemap/
│ │ │ └── GetUrlsFromSitemap.php
│ │ ├── Sitemap.php
│ │ ├── Step.php
│ │ ├── StepInterface.php
│ │ ├── StepOutputType.php
│ │ └── Xml.php
│ ├── Stores/
│ │ ├── JsonFileStore.php
│ │ ├── SimpleCsvFileStore.php
│ │ ├── Store.php
│ │ └── StoreInterface.php
│ ├── UserAgents/
│ │ ├── BotUserAgent.php
│ │ ├── BotUserAgentInterface.php
│ │ ├── UserAgent.php
│ │ └── UserAgentInterface.php
│ └── Utils/
│ ├── Gzip.php
│ ├── HttpHeaders.php
│ ├── OutputTypeHelper.php
│ ├── RequestKey.php
│ └── TemplateString.php
└── tests/
├── Cache/
│ ├── CacheItemTest.php
│ ├── FileCacheTest.php
│ └── _cachefilecontent
├── CrawlerTest.php
├── HttpCrawler/
│ └── AnonymousHttpCrawlerBuilderTest.php
├── IoTest.php
├── Loader/
│ ├── Http/
│ │ ├── Browser/
│ │ │ └── ScreenshotConfigTest.php
│ │ ├── Cache/
│ │ │ └── RetryManagerTest.php
│ │ ├── Cookies/
│ │ │ ├── CookieJarTest.php
│ │ │ ├── CookieTest.php
│ │ │ └── DateTest.php
│ │ ├── HeadlessBrowserLoaderHelperTest.php
│ │ ├── HttpLoaderPolitenessTest.php
│ │ ├── HttpLoaderTest.php
│ │ ├── Messages/
│ │ │ └── RespondedRequestTest.php
│ │ ├── Politeness/
│ │ │ ├── RobotsTxtHandlerTest.php
│ │ │ ├── ThrottlerTest.php
│ │ │ └── TimingUnits/
│ │ │ └── MultipleOfTest.php
│ │ └── ProxyManagerTest.php
│ └── LoaderTest.php
├── Logger/
│ ├── CliLoggerTest.php
│ └── PreStepInvocationLoggerTest.php
├── Pest.php
├── ResultTest.php
├── Steps/
│ ├── BaseStepTest.php
│ ├── CsvTest.php
│ ├── Dom/
│ │ ├── HtmlDocumentTest.php
│ │ ├── HtmlElementTest.php
│ │ ├── NodeListTest.php
│ │ ├── NodeTest.php
│ │ ├── XmlDocumentTest.php
│ │ ├── XmlElementTest.php
│ │ └── _Stubs/
│ │ ├── HtmlNodeStub.php
│ │ └── XmlNodeStub.php
│ ├── DomTest.php
│ ├── Filters/
│ │ ├── ArrayFilterTest.php
│ │ ├── ClosureFilterTest.php
│ │ ├── ComparisonFilterTest.php
│ │ ├── Enums/
│ │ │ ├── ComparisonFilterRuleTest.php
│ │ │ ├── StringFilterRuleTest.php
│ │ │ ├── StringLengthFilterRuleTest.php
│ │ │ └── UrlFilterRuleTest.php
│ │ ├── FilterTest.php
│ │ ├── NegatedFilterTest.php
│ │ ├── StringFilterTest.php
│ │ ├── StringLengthFilterTest.php
│ │ └── UrlFilterTest.php
│ ├── GroupTest.php
│ ├── Html/
│ │ ├── CssSelectorTest.php
│ │ ├── Exceptions/
│ │ │ └── InvalidDomQueryExceptionTest.php
│ │ ├── GetLinkTest.php
│ │ ├── GetLinksTest.php
│ │ ├── MetaDataTest.php
│ │ ├── SchemaOrgTest.php
│ │ └── XPathQueryTest.php
│ ├── HtmlTest.php
│ ├── JsonTest.php
│ ├── Loading/
│ │ ├── GetSitemapsFromRobotsTxtTest.php
│ │ ├── Http/
│ │ │ ├── DocumentTest.php
│ │ │ └── Paginators/
│ │ │ ├── AbstractPaginatorTest.php
│ │ │ ├── QueryParams/
│ │ │ │ ├── AbstractQueryParamManipulatorTest.php
│ │ │ │ ├── DecrementorTest.php
│ │ │ │ └── IncrementorTest.php
│ │ │ ├── QueryParamsPaginatorTest.php
│ │ │ ├── SimpleWebsitePaginatorTest.php
│ │ │ └── StopRules/
│ │ │ ├── ContainsTest.php
│ │ │ ├── IsEmptyInHtmlTest.php
│ │ │ ├── IsEmptyInJsonTest.php
│ │ │ ├── IsEmptyInXmlTest.php
│ │ │ ├── IsEmptyResponseTest.php
│ │ │ └── NotContainsTest.php
│ │ ├── HttpTest.php
│ │ └── LoadingStepTest.php
│ ├── Refiners/
│ │ ├── AbstractRefinerTest.php
│ │ ├── DateTime/
│ │ │ └── DateTimeFormatTest.php
│ │ ├── Html/
│ │ │ └── RemoveFromHtmlTest.php
│ │ ├── String/
│ │ │ ├── AfterFirstTest.php
│ │ │ ├── AfterLastTest.php
│ │ │ ├── BeforeFirstTest.php
│ │ │ ├── BeforeLastTest.php
│ │ │ ├── BetweenFirstTest.php
│ │ │ ├── BetweenLastTest.php
│ │ │ └── ReplaceTest.php
│ │ └── Url/
│ │ ├── WithFragmentTest.php
│ │ ├── WithHostTest.php
│ │ ├── WithPathTest.php
│ │ ├── WithPortTest.php
│ │ ├── WithQueryTest.php
│ │ ├── WithSchemeTest.php
│ │ └── WithoutPortTest.php
│ ├── Sitemap/
│ │ └── GetUrlsFromSitemapTest.php
│ ├── StepTest.php
│ ├── XmlTest.php
│ └── _Files/
│ ├── Csv/
│ │ ├── basic.csv
│ │ ├── enclosure.csv
│ │ ├── escape.csv
│ │ ├── separator.csv
│ │ └── with-column-headlines.csv
│ ├── Html/
│ │ ├── basic.html
│ │ ├── bookstore.html
│ │ └── event.html
│ └── Xml/
│ ├── bookstore.xml
│ ├── events.xml
│ └── rss-with-bom.xml
├── Stores/
│ ├── JsonFileStoreTest.php
│ ├── SimpleCsvFileStoreTest.php
│ └── _files/
│ └── .gitkeep
├── UserAgents/
│ ├── BotUserAgentTest.php
│ └── UserAgentTest.php
├── Utils/
│ ├── GzipTest.php
│ ├── HttpHeadersTest.php
│ ├── OutputTypeHelperTest.php
│ ├── RequestKeyTest.php
│ └── TemplateStringTest.php
├── _Integration/
│ ├── GroupTest.php
│ ├── Http/
│ │ ├── CharsetTest.php
│ │ ├── CrawlingTest.php
│ │ ├── ErrorResponsesTest.php
│ │ ├── GzipTest.php
│ │ ├── HeadlessBrowserTest.php
│ │ ├── Html/
│ │ │ ├── PaginatedListingTest.php
│ │ │ └── SimpleListingTest.php
│ │ ├── PaginationTest.php
│ │ ├── ProxyingTest.php
│ │ ├── PublisherExampleTest.php
│ │ ├── QueryParamPaginationTest.php
│ │ ├── RedirectTest.php
│ │ ├── RequestParamsFromInputTest.php
│ │ ├── RetryErrorResponsesTest.php
│ │ ├── RobotsTxtTest.php
│ │ └── TimeoutTest.php
│ ├── ProxyServer.php
│ ├── Server.php
│ └── _Server/
│ ├── BlogPostWithJsonLd.php
│ ├── BrokenMimeTypeRss.php
│ ├── BrowserActions/
│ │ ├── ClickAndWaitForReload.php
│ │ ├── EvaluateAndWaitForReload.php
│ │ ├── EvaluateAndWaitForReloadReloaded.php
│ │ ├── Main.php
│ │ └── Wait.php
│ ├── Crawling.php
│ ├── HelloWorld.php
│ ├── JsGeneratedContent.php
│ ├── NonUtf8.php
│ ├── PageInitScript.php
│ ├── PaginatedListing/
│ │ └── Detail.php
│ ├── PaginatedListing.php
│ ├── PrintCookie.php
│ ├── PrintCookies.php
│ ├── PrintHeaders.php
│ ├── Publisher/
│ │ ├── AuthorDetailPage.php
│ │ ├── AuthorsListPage.php
│ │ ├── BookDetailPage.php
│ │ └── EditionDetailPage.php
│ ├── QueryParamPagination.php
│ ├── RssFeed.php
│ ├── ServiceUnavailable.php
│ ├── SetCookie.php
│ ├── SetCookieJs.php
│ ├── SetDelayedCookieJs.php
│ ├── SetMultipleCookiesJs.php
│ ├── SimpleListing/
│ │ └── Detail.php
│ ├── SimpleListing.php
│ └── TooManyRequests.php
├── _Stubs/
│ ├── AbstractTestPaginator.php
│ ├── Crawlers/
│ │ ├── DummyOne.php
│ │ ├── DummyTwo/
│ │ │ ├── DummyTwoLoader.php
│ │ │ ├── DummyTwoLogger.php
│ │ │ └── DummyTwoUserAgent.php
│ │ └── DummyTwo.php
│ ├── DummyLogger.php
│ ├── PhantasyLoader.php
│ └── RespondedRequestChild.php
└── _Temp/
├── _cachedir/
│ └── .gitkeep
└── _storagedir/
└── .gitkeep
SYMBOL INDEX (1268 symbols across 189 files)
FILE: src/Cache/CacheItem.php
class CacheItem (line 9) | class CacheItem
method __construct (line 13) | public function __construct(
method key (line 30) | public function key(): string
method value (line 35) | public function value(): mixed
method isExpired (line 43) | public function isExpired(): bool
method withTtl (line 53) | public function withTtl(DateInterval|int $ttl): CacheItem
method __serialize (line 61) | public function __serialize(): array
method __unserialize (line 74) | public function __unserialize(array $data): void
FILE: src/Cache/Exceptions/MissingZlibExtensionException.php
class MissingZlibExtensionException (line 8) | class MissingZlibExtensionException extends Exception implements CacheEx...
FILE: src/Cache/Exceptions/ReadingCacheFailedException.php
class ReadingCacheFailedException (line 8) | class ReadingCacheFailedException extends Exception implements CacheExce...
FILE: src/Cache/FileCache.php
class FileCache (line 14) | class FileCache implements CacheInterface
method __construct (line 20) | public function __construct(
method useCompression (line 24) | public function useCompression(): static
method ttl (line 31) | public function ttl(DateInterval|int $ttl): static
method has (line 41) | public function has(string $key): bool
method get (line 59) | public function get(string $key, mixed $default = null): mixed
method set (line 77) | public function set(string $key, mixed $value, DateInterval|int|null $...
method delete (line 88) | public function delete(string $key): bool
method prolong (line 93) | public function prolong(string $key, DateInterval|int $ttl): bool
method clear (line 107) | public function clear(): bool
method prolongAll (line 122) | public function prolongAll(DateInterval|int $ttl): bool
method getMultiple (line 141) | public function getMultiple(iterable $keys, mixed $default = null): it...
method setMultiple (line 156) | public function setMultiple(iterable $values, DateInterval|int|null $t...
method deleteMultiple (line 167) | public function deleteMultiple(iterable $keys): bool
method getCacheItem (line 182) | protected function getCacheItem(string $key): CacheItem
method saveCacheItem (line 202) | protected function saveCacheItem(CacheItem $item): bool
method unserialize (line 213) | protected function unserialize(string $content): mixed
method getFileContents (line 239) | protected function getFileContents(string $key): string
method encode (line 253) | protected function encode(string $content): string
method decode (line 267) | protected function decode(string $content): string
FILE: src/Crawler.php
class Crawler (line 19) | abstract class Crawler
method __construct (line 43) | public function __construct()
method __clone (line 52) | public function __clone(): void
method userAgent (line 63) | abstract protected function userAgent(): UserAgentInterface;
method loader (line 70) | abstract protected function loader(UserAgentInterface $userAgent, Logg...
method group (line 72) | public static function group(): Group
method setMemoryLimit (line 77) | public static function setMemoryLimit(string $memoryLimit): false|string
method getMemoryLimit (line 82) | public static function getMemoryLimit(): false|string
method getSubCrawler (line 87) | public function getSubCrawler(): Crawler
method getUserAgent (line 92) | public function getUserAgent(): UserAgentInterface
method setUserAgent (line 97) | public function setUserAgent(UserAgentInterface $userAgent): static
method getLogger (line 106) | public function getLogger(): LoggerInterface
method getLoader (line 114) | public function getLoader(): LoaderInterface|array
method setStore (line 119) | public function setStore(StoreInterface $store): static
method input (line 128) | public function input(mixed $input): static
method inputs (line 138) | public function inputs(array $inputs): static
method addStep (line 150) | public function addStep(StepInterface $step): static
method runAndTraverse (line 175) | public function runAndTraverse(): void
method runAndDump (line 186) | public function runAndDump(): void
method run (line 203) | public function run(): Generator
method monitorMemoryUsage (line 229) | public function monitorMemoryUsage(?int $ifAboveXBytes = null): static
method outputHook (line 236) | public function outputHook(Closure $callback): static
method logger (line 243) | protected function logger(): LoggerInterface
method invokeStepsRecursive (line 251) | protected function invokeStepsRecursive(Input $input, StepInterface $s...
method storeAndReturnOutputsAsResults (line 282) | protected function storeAndReturnOutputsAsResults(Generator $outputs):...
method validateSteps (line 312) | protected function validateSteps(): void
method prepareInput (line 339) | protected function prepareInput(): array
method logMemoryUsage (line 346) | protected function logMemoryUsage(): void
method firstStep (line 355) | protected function firstStep(): ?StepInterface
method lastStep (line 360) | protected function lastStep(): ?BaseStep
method nextStep (line 371) | protected function nextStep(int $afterIndex): ?StepInterface
method reset (line 376) | protected function reset(): void
FILE: src/HttpCrawler.php
class HttpCrawler (line 15) | abstract class HttpCrawler extends Crawler
method loader (line 20) | protected function loader(UserAgentInterface $userAgent, LoggerInterfa...
method make (line 25) | public static function make(): HttpCrawler\AnonymousHttpCrawlerBuilder
FILE: src/HttpCrawler/AnonymousHttpCrawlerBuilder.php
class AnonymousHttpCrawlerBuilder (line 10) | class AnonymousHttpCrawlerBuilder
method __construct (line 12) | public function __construct() {}
method withBotUserAgent (line 14) | public function withBotUserAgent(string $productToken): HttpCrawler
method withUserAgent (line 28) | public function withUserAgent(string|UserAgentInterface $userAgent): H...
method withMozilla5CompatibleUserAgent (line 44) | public function withMozilla5CompatibleUserAgent(): HttpCrawler
FILE: src/Input.php
class Input (line 5) | class Input extends Io {}
FILE: src/Io.php
class Io (line 7) | class Io
method __construct (line 14) | final public function __construct(
method withValue (line 25) | public function withValue(mixed $value): static
method withPropertyValue (line 30) | public function withPropertyValue(string $key, mixed $value): static
method get (line 43) | public function get(): mixed
method getProperty (line 48) | public function getProperty(string $key, mixed $fallbackValue = null):...
method setKey (line 68) | public function setKey(?string $useFromValue = null): string
method getKey (line 81) | public function getKey(): string|int|float|bool|null
method keep (line 93) | public function keep(array $data): static
method isArrayWithStringKeys (line 100) | public function isArrayWithStringKeys(): bool
method valueToString (line 115) | protected function valueToString(mixed $value): string
FILE: src/Loader/Http/Browser/Screenshot.php
class Screenshot (line 5) | class Screenshot
method __construct (line 7) | public function __construct(
FILE: src/Loader/Http/Browser/ScreenshotConfig.php
class ScreenshotConfig (line 11) | class ScreenshotConfig
method __construct (line 13) | public function __construct(
method make (line 20) | public static function make(string $storePath): self
method getFullPath (line 29) | public function getFullPath(Page $page): string
method setImageFileType (line 36) | public function setImageFileType(string $type): self
method setQuality (line 51) | public function setQuality(int $quality): self
method setFullPage (line 60) | public function setFullPage(): self
method toChromePhpScreenshotConfig (line 70) | public function toChromePhpScreenshotConfig(Page $page): array
FILE: src/Loader/Http/Cache/RetryManager.php
class RetryManager (line 8) | class RetryManager
method __construct (line 14) | public function __construct(
method only (line 22) | public function only(int|array $statusCodes): static
method except (line 34) | public function except(int|array $statusCodes): static
method shallBeRetried (line 43) | public function shallBeRetried(int $statusCode): bool
FILE: src/Loader/Http/Cookies/Cookie.php
class Cookie (line 11) | class Cookie
method __construct (line 43) | public function __construct(
method shouldBeSentTo (line 66) | public function shouldBeSentTo(string|UriInterface|Url $url): bool
method __toString (line 80) | public function __toString(): string
method receivedFromUrl (line 85) | public function receivedFromUrl(): UriInterface
method name (line 90) | public function name(): string
method value (line 95) | public function value(): string
method expires (line 100) | public function expires(): ?Date
method maxAge (line 105) | public function maxAge(): ?int
method isExpired (line 110) | public function isExpired(): bool
method domain (line 126) | public function domain(): string
method path (line 131) | public function path(): ?string
method secure (line 136) | public function secure(): bool
method httpOnly (line 141) | public function httpOnly(): bool
method sameSite (line 146) | public function sameSite(): string
method isReceivedSecure (line 154) | public function isReceivedSecure(): bool
method hasSecurePrefix (line 159) | public function hasSecurePrefix(): bool
method hasHostPrefix (line 164) | public function hasHostPrefix(): bool
method parseSetCookieHeader (line 172) | protected function parseSetCookieHeader(string $setCookieHeader): void
method parseAttribute (line 194) | protected function parseAttribute(string $attribute): void
method checkPrefixes (line 224) | protected function checkPrefixes(): void
method setExpires (line 251) | protected function setExpires(string $value): void
method setMaxAge (line 256) | protected function setMaxAge(string $value): void
method setDomain (line 267) | protected function setDomain(string $value, bool $viaAttribute = false...
method setPath (line 286) | protected function setPath(string $path): void
method setSecure (line 295) | protected function setSecure(): void
method setSameSite (line 309) | protected function setSameSite(string $value): void
method pathMatches (line 323) | protected function pathMatches(Url $url): bool
FILE: src/Loader/Http/Cookies/CookieJar.php
class CookieJar (line 14) | class CookieJar
method allByDomain (line 25) | public function allByDomain(string $domain): array
method flush (line 34) | public function flush(): void
method addFrom (line 43) | public function addFrom(string|UriInterface|Url $url, ResponseInterfac...
method addFromBrowserCookieCollection (line 70) | public function addFromBrowserCookieCollection(string|UriInterface|Url...
method getFor (line 95) | public function getFor(string|UriInterface $url): array
method getForDomainFromUrl (line 117) | protected function getForDomainFromUrl(string|UriInterface|Url $url): ...
method buildSetCookieHeaderFromBrowserCookie (line 132) | protected function buildSetCookieHeaderFromBrowserCookie(BrowserCookie...
method formatExpiresValue (line 175) | private function formatExpiresValue(mixed $value): string
FILE: src/Loader/Http/Cookies/Date.php
class Date (line 9) | class Date
method __construct (line 13) | public function __construct(protected readonly string $httpDateString) {}
method dateTime (line 18) | public function dateTime(): DateTime
FILE: src/Loader/Http/Cookies/Exceptions/InvalidCookieException.php
class InvalidCookieException (line 7) | class InvalidCookieException extends Exception {}
FILE: src/Loader/Http/Exceptions/LoadingException.php
class LoadingException (line 9) | class LoadingException extends Exception
method from (line 13) | public static function from(Throwable $previousException): self
method make (line 22) | public static function make(string|UriInterface $uri, ?int $httpStatus...
FILE: src/Loader/Http/HeadlessBrowserLoaderHelper.php
class HeadlessBrowserLoaderHelper (line 31) | class HeadlessBrowserLoaderHelper
method __construct (line 65) | public function __construct(
method setTempPostNavigateHooks (line 78) | public function setTempPostNavigateHooks(array $hooks): static
method navigateToPageAndGetRespondedRequest (line 96) | public function navigateToPageAndGetRespondedRequest(
method getOpenBrowser (line 156) | public function getOpenBrowser(): ?Browser
method getOpenPage (line 161) | public function getOpenPage(): ?Page
method closeBrowser (line 169) | public function closeBrowser(): void
method setExecutable (line 184) | public function setExecutable(string $executable): static
method setOptions (line 194) | public function setOptions(array $options): static
method addOptions (line 206) | public function addOptions(array $options): static
method waitForNavigationEvent (line 217) | public function waitForNavigationEvent(string $eventName): static
method getTimeout (line 224) | public function getTimeout(): int
method setTimeout (line 229) | public function setTimeout(int $timeout): static
method sanitizeResponseHeaders (line 240) | public function sanitizeResponseHeaders(array $headers): array
method setPageInitScript (line 253) | public function setPageInitScript(string $scriptSource): static
method useNativeUserAgent (line 260) | public function useNativeUserAgent(): static
method includeShadowElementsInHtml (line 267) | public function includeShadowElementsInHtml(): static
method navigate (line 283) | protected function navigate(string $url): void
method callPostNavigateHooks (line 295) | protected function callPostNavigateHooks(): array
method addCookiesToJar (line 324) | protected function addCookiesToJar(?CookieJar $cookieJar, UriInterface...
method getBrowser (line 340) | protected function getBrowser(
method shouldRenewBrowser (line 365) | protected function shouldRenewBrowser(?string $proxy): bool
method optionsFromRequest (line 374) | protected function optionsFromRequest(RequestInterface $request, ?stri...
method prepareRequestHeaders (line 402) | protected function prepareRequestHeaders(array $headers = []): array
method removeHeadersCausingErrorWithHeadlessBrowser (line 415) | protected function removeHeadersCausingErrorWithHeadlessBrowser(array ...
method responseIsHtmlDocument (line 428) | protected function responseIsHtmlDocument(?Page $page = null): bool
method tryToGetRawResponseBody (line 451) | protected function tryToGetRawResponseBody(Page $page, string $request...
method getHtmlFromPage (line 475) | protected function getHtmlFromPage(): string
FILE: src/Loader/Http/HttpLoader.php
class HttpLoader (line 40) | class HttpLoader extends Loader
method __construct (line 82) | public function __construct(
method load (line 122) | public function load(mixed $subject): ?RespondedRequest
method loadOrFail (line 173) | public function loadOrFail(mixed $subject): RespondedRequest
method dontUseCookies (line 206) | public function dontUseCookies(): static
method flushCookies (line 213) | public function flushCookies(): void
method useHeadlessBrowser (line 218) | public function useHeadlessBrowser(): static
method useHttpClient (line 228) | public function useHttpClient(): static
method usesHeadlessBrowser (line 237) | public function usesHeadlessBrowser(): bool
method setMaxRedirects (line 242) | public function setMaxRedirects(int $maxRedirects): static
method robotsTxt (line 249) | public function robotsTxt(): RobotsTxtHandler
method throttle (line 258) | public function throttle(): Throttler
method retryCachedErrorResponses (line 263) | public function retryCachedErrorResponses(): RetryManager
method writeOnlyCache (line 270) | public function writeOnlyCache(): static
method cacheOnlyWhereUrl (line 277) | public function cacheOnlyWhereUrl(FilterInterface $filter): static
method useProxy (line 287) | public function useProxy(string $proxyUrl): void
method useRotatingProxies (line 298) | public function useRotatingProxies(array $proxyUrls): void
method browser (line 305) | public function browser(): HeadlessBrowserLoaderHelper
method addToCache (line 317) | public function addToCache(RespondedRequest $respondedRequest): void
method skipCacheForNextRequest (line 324) | public function skipCacheForNextRequest(): static
method tryLoading (line 334) | protected function tryLoading(
method waitForGoAndLoad (line 375) | protected function waitForGoAndLoad(RequestInterface $request): Respon...
method loadViaClientOrHeadlessBrowser (line 408) | protected function loadViaClientOrHeadlessBrowser(RequestInterface $re...
method handleRedirects (line 430) | protected function handleRedirects(
method sendProxiedRequestUsingGuzzle (line 475) | protected function sendProxiedRequestUsingGuzzle(RequestInterface $req...
method checkIfProxiesCanBeUsed (line 493) | protected function checkIfProxiesCanBeUsed(): void
method mergeClientConfigWithDefaults (line 507) | protected function mergeClientConfigWithDefaults(array $config): array
method isAllowedToBeLoaded (line 522) | protected function isAllowedToBeLoaded(UriInterface $uri, bool $throws...
method getFromCache (line 543) | protected function getFromCache(RequestInterface $request): ?Responded...
method shouldResponseBeCached (line 573) | protected function shouldResponseBeCached(RespondedRequest $respondedR...
method shouldRequestBeServedFromCache (line 594) | protected function shouldRequestBeServedFromCache(RequestInterface $re...
method validateSubjectType (line 614) | protected function validateSubjectType(RequestInterface|string $reques...
method prepareRequest (line 643) | protected function prepareRequest(RequestInterface $request): RequestI...
method addCookiesToJar (line 658) | protected function addCookiesToJar(RespondedRequest $respondedRequest)...
method addCookiesToRequest (line 672) | protected function addCookiesToRequest(RequestInterface $request): Req...
FILE: src/Loader/Http/Messages/RespondedRequest.php
class RespondedRequest (line 16) | class RespondedRequest
method __construct (line 29) | public function __construct(
method fromArray (line 42) | public static function fromArray(array $data): RespondedRequest
method __serialize (line 61) | public function __serialize(): array
method toArrayForResult (line 80) | public function toArrayForResult(): array
method __unserialize (line 103) | public function __unserialize(array $data): void
method effectiveUri (line 116) | public function effectiveUri(): string
method requestedUri (line 121) | public function requestedUri(): string
method allUris (line 129) | public function allUris(): array
method isRedirect (line 140) | public function isRedirect(): bool
method redirects (line 148) | public function redirects(): array
method setResponse (line 156) | public function setResponse(ResponseInterface $response): void
method addRedirectUri (line 168) | public function addRedirectUri(?string $redirectUri = null): void
method cacheKey (line 180) | public function cacheKey(): string
method isServedFromCache (line 185) | public function isServedFromCache(): bool
method setIsServedFromCache (line 190) | public function setIsServedFromCache(bool $value = true): void
method requestFromArray (line 198) | protected static function requestFromArray(array $data): Request
method responseFromArray (line 211) | protected static function responseFromArray(array $data): Response
method screenshotsFromArray (line 224) | protected static function screenshotsFromArray(array $data): array
FILE: src/Loader/Http/Politeness/RetryErrorResponseHandler.php
class RetryErrorResponseHandler (line 11) | class RetryErrorResponseHandler
method __construct (line 26) | public function __construct(
method shouldWait (line 32) | public function shouldWait(RespondedRequest $respondedRequest): bool
method setLogger (line 41) | public function setLogger(LoggerInterface $logger): void
method handleRetries (line 49) | public function handleRetries(
method getWaitTimeFromResponse (line 83) | protected function getWaitTimeFromResponse(ResponseInterface $response...
method getResponseCodeAndReasonPhrase (line 104) | protected function getResponseCodeAndReasonPhrase(RespondedRequest|Res...
method logReceivedErrorResponseMessage (line 117) | protected function logReceivedErrorResponseMessage(RespondedRequest $r...
method logWaitForRetryMessage (line 126) | protected function logWaitForRetryMessage(int $retryNumber): void
method logRepeatedErrorMessage (line 131) | protected function logRepeatedErrorMessage(RespondedRequest $responded...
method retryAfterExceedsLimitMessage (line 141) | protected function retryAfterExceedsLimitMessage(ResponseInterface $re...
FILE: src/Loader/Http/Politeness/RobotsTxtHandler.php
class RobotsTxtHandler (line 17) | class RobotsTxtHandler
method __construct (line 28) | public function __construct(
method ignoreWildcardRules (line 35) | public function ignoreWildcardRules(): void
method isAllowed (line 43) | public function isAllowed(string|UriInterface|Url $url): bool
method getSitemaps (line 68) | public function getSitemaps(string|UriInterface|Url $url): array
method getRobotsTxtFor (line 76) | protected function getRobotsTxtFor(string|UriInterface|Url $url): Robo...
method loadRobotsTxtContent (line 99) | protected function loadRobotsTxtContent(string $robotsTxtUrl): string
method getUrlInstance (line 119) | protected function getUrlInstance(string|UriInterface|Url $url): Url
FILE: src/Loader/Http/Politeness/Throttler.php
class Throttler (line 12) | class Throttler
method __construct (line 43) | public function __construct(
method waitBetween (line 61) | public function waitBetween(Microseconds|MultipleOf $from, Microsecond...
method waitAtLeast (line 72) | public function waitAtLeast(Microseconds $seconds): static
method waitAtMax (line 79) | public function waitAtMax(Microseconds $seconds): static
method trackRequestStartFor (line 89) | public function trackRequestStartFor(UriInterface $url): void
method trackRequestEndFor (line 101) | public function trackRequestEndFor(UriInterface $url): void
method waitForGo (line 125) | public function waitForGo(UriInterface $url): void
method time (line 146) | protected function time(): Microseconds
method getDomain (line 154) | protected function getDomain(UriInterface $url): string
method calcWaitUntil (line 169) | protected function calcWaitUntil(
method getRandBetween (line 190) | protected function getRandBetween(Microseconds $from, Microseconds $to...
method _internalTrackStartFor (line 202) | protected function _internalTrackStartFor(UriInterface $url): void
method _internalTrackEndFor (line 212) | protected function _internalTrackEndFor(UriInterface $url): void
method _requestToUrlWasStarted (line 217) | protected function _requestToUrlWasStarted(UriInterface $url): bool
method validateFromAndTo (line 228) | protected function validateFromAndTo(): void
method fromAndToAreOfSameType (line 239) | protected function fromAndToAreOfSameType(): bool
method fromIsGreaterThanTo (line 245) | protected function fromIsGreaterThanTo(): bool
FILE: src/Loader/Http/Politeness/TimingUnits/MultipleOf.php
class MultipleOf (line 7) | class MultipleOf
method __construct (line 9) | public function __construct(public readonly float $factor) {}
method calc (line 11) | public function calc(Microseconds $microseconds): Microseconds
method factorIsGreaterThan (line 20) | public function factorIsGreaterThan(MultipleOf $multipleOf): bool
FILE: src/Loader/Http/ProxyManager.php
class ProxyManager (line 5) | class ProxyManager
method __construct (line 12) | public function __construct(protected array $proxies)
method singleProxy (line 17) | public function singleProxy(): bool
method hasOnlySingleProxy (line 22) | public function hasOnlySingleProxy(): bool
method hasMultipleProxies (line 27) | public function hasMultipleProxies(): bool
method getProxy (line 32) | public function getProxy(): string
FILE: src/Loader/Loader.php
class Loader (line 11) | abstract class Loader implements LoaderInterface
method __construct (line 33) | public function __construct(
method beforeLoad (line 40) | public function beforeLoad(callable $callback): void
method onCacheHit (line 45) | public function onCacheHit(callable $callback): void
method onSuccess (line 50) | public function onSuccess(callable $callback): void
method onError (line 55) | public function onError(callable $callback): void
method afterLoad (line 60) | public function afterLoad(callable $callback): void
method setCache (line 65) | public function setCache(CacheInterface $cache): static
method userAgent (line 72) | public function userAgent(): UserAgentInterface
method isAllowedToBeLoaded (line 81) | protected function isAllowedToBeLoaded(UriInterface $uri, bool $throws...
method callHook (line 86) | protected function callHook(string $hook, mixed ...$arguments): void
method logger (line 120) | protected function logger(): LoggerInterface
method addHookCallback (line 125) | protected function addHookCallback(string $hook, callable $callback): ...
method _resetCalledHooks (line 134) | protected function _resetCalledHooks(): void
FILE: src/Loader/LoaderInterface.php
type LoaderInterface (line 9) | interface LoaderInterface
method load (line 15) | public function load(mixed $subject): mixed;
method loadOrFail (line 22) | public function loadOrFail(mixed $subject): mixed;
method setCache (line 27) | public function setCache(CacheInterface $cache): static;
FILE: src/Logger/CliLogger.php
class CliLogger (line 11) | class CliLogger implements LoggerInterface
method emergency (line 13) | public function emergency(string|Stringable $message, array $context =...
method alert (line 18) | public function alert(string|Stringable $message, array $context = [])...
method critical (line 23) | public function critical(string|Stringable $message, array $context = ...
method error (line 28) | public function error(string|Stringable $message, array $context = [])...
method warning (line 33) | public function warning(string|Stringable $message, array $context = [...
method notice (line 38) | public function notice(string|Stringable $message, array $context = []...
method info (line 43) | public function info(string|Stringable $message, array $context = []):...
method debug (line 48) | public function debug(string|Stringable $message, array $context = [])...
method log (line 57) | public function log($level, string|Stringable $message, array $context...
method printTimeAndLevel (line 71) | protected function printTimeAndLevel(string $level): void
method time (line 76) | protected function time(): string
method levelColor (line 81) | protected function levelColor(string $level): string
FILE: src/Logger/PreStepInvocationLogger.php
class PreStepInvocationLogger (line 10) | class PreStepInvocationLogger implements LoggerInterface
method emergency (line 17) | public function emergency(string|Stringable $message, array $context =...
method alert (line 22) | public function alert(string|Stringable $message, array $context = [])...
method critical (line 27) | public function critical(string|Stringable $message, array $context = ...
method error (line 32) | public function error(string|Stringable $message, array $context = [])...
method warning (line 37) | public function warning(string|Stringable $message, array $context = [...
method notice (line 42) | public function notice(string|Stringable $message, array $context = []...
method info (line 47) | public function info(string|Stringable $message, array $context = []):...
method debug (line 52) | public function debug(string|Stringable $message, array $context = [])...
method log (line 61) | public function log($level, string|Stringable $message, array $context...
method passToOtherLogger (line 74) | public function passToOtherLogger(LoggerInterface $logger): void
FILE: src/Output.php
class Output (line 5) | class Output extends Io {}
FILE: src/Result.php
class Result (line 7) | final class Result
method __construct (line 14) | public function __construct(protected ?Result $result = null)
method set (line 21) | public function set(string $key, mixed $value): self
method has (line 40) | public function has(string $key): bool
method get (line 45) | public function get(string $key, mixed $default = null): mixed
method toArray (line 57) | public function toArray(): array
method getUnnamedKey (line 72) | private function getUnnamedKey(): string
method isAssociativeArray (line 86) | private function isAssociativeArray(array $array): bool
FILE: src/Steps/BaseStep.php
class BaseStep (line 25) | abstract class BaseStep implements StepInterface
method invokeStep (line 91) | abstract public function invokeStep(Input $input): Generator;
method addLogger (line 93) | public function addLogger(LoggerInterface $logger): static
method setParentCrawler (line 114) | public function setParentCrawler(Crawler $crawler): static
method keep (line 124) | public function keep(string|array|null $keys = null): static
method keepAs (line 135) | public function keepAs(string $key): static
method keepFromInput (line 145) | public function keepFromInput(string|array|null $keys = null): static
method keepInputAs (line 156) | public function keepInputAs(string $key): static
method keepsAnything (line 163) | public function keepsAnything(): bool
method keepsAnythingFromInputData (line 168) | public function keepsAnythingFromInputData(): bool
method keepsAnythingFromOutputData (line 173) | public function keepsAnythingFromOutputData(): bool
method useInputKey (line 178) | public function useInputKey(string $key): static
method uniqueInputs (line 185) | public function uniqueInputs(?string $key = null): static
method uniqueOutputs (line 192) | public function uniqueOutputs(?string $key = null): static
method refineOutput (line 199) | public function refineOutput(
method outputKey (line 222) | public function outputKey(string $key): static
method maxOutputs (line 229) | public function maxOutputs(int $maxOutputs): static
method resetAfterRun (line 236) | public function resetAfterRun(): void
method outputType (line 257) | public function outputType(): StepOutputType
method validateBeforeRun (line 266) | public function validateBeforeRun(BaseStep|array $previousStepOrInitia...
method subCrawlerFor (line 316) | public function subCrawlerFor(string $for, Closure $crawlerBuilder): s...
method storeOriginalInput (line 327) | protected function storeOriginalInput(Input $input): void
method getFullOriginalInput (line 336) | protected function getFullOriginalInput(): ?Input
method runSubCrawlersFor (line 341) | protected function runSubCrawlersFor(Output $output): Output
method outputKeyAliases (line 400) | protected function outputKeyAliases(): array
method validateFirstStepBeforeRun (line 409) | protected function validateFirstStepBeforeRun(array $initialInputs): void
method getPreValidationRunMessageStartWithStepClassName (line 430) | protected function getPreValidationRunMessageStartWithStepClassName(?B...
method getStepClassName (line 451) | protected function getStepClassName(?BaseStep $step = null): ?string
method getParentStepClassName (line 462) | protected function getParentStepClassName(?BaseStep $step = null): ?st...
method getInputKeyToUse (line 475) | protected function getInputKeyToUse(Input $input): ?Input
method inputOrOutputIsUnique (line 511) | protected function inputOrOutputIsUnique(Io $io): bool
method applyRefiners (line 532) | protected function applyRefiners(mixed $outputValue, mixed $inputValue...
method makeOutput (line 563) | protected function makeOutput(mixed $outputData, Input $input): Output
method keepData (line 577) | protected function keepData(Output $output, Input $input): void
method getOutputDataToKeep (line 604) | protected function getOutputDataToKeep(Output $output, array $alreadyK...
method getInputDataToKeep (line 613) | protected function getInputDataToKeep(Input $input, array $alreadyKept...
method getInputOrOutputDataToKeep (line 622) | protected function getInputOrOutputDataToKeep(Io $io, array $alreadyKe...
method nextUnnamedKey (line 665) | protected function nextUnnamedKey(array $data): string
method mapKeepProperties (line 681) | protected function mapKeepProperties(array $data, array $keep): array
method getOutputPropertyFromArray (line 699) | protected function getOutputPropertyFromArray(string $key, array $data...
method isOutputKeyAlias (line 714) | protected function isOutputKeyAlias(string $key): bool
method getOutputKeyAliasRealKey (line 719) | protected function getOutputKeyAliasRealKey(string $key): string
method maxOutputsExceeded (line 726) | protected function maxOutputsExceeded(): bool
method trackYieldedOutput (line 731) | protected function trackYieldedOutput(): void
FILE: src/Steps/Csv.php
class Csv (line 9) | class Csv extends Step
method __construct (line 22) | public function __construct(protected array $columnMapping = [], prote...
method parseString (line 27) | public static function parseString(array $columnMapping = [], bool $sk...
method parseFile (line 35) | public static function parseFile(array $columnMapping = [], bool $skip...
method skipFirstLine (line 44) | public function skipFirstLine(): static
method separator (line 51) | public function separator(string $separator): static
method enclosure (line 62) | public function enclosure(string $enclosure): static
method escape (line 69) | public function escape(string $escape): static
method outputType (line 76) | public function outputType(): StepOutputType
method validateAndSanitizeInput (line 81) | protected function validateAndSanitizeInput(mixed $input): string
method invoke (line 96) | protected function invoke(mixed $input): Generator
method readFile (line 109) | protected function readFile(string $filePath): Generator
method mapLines (line 142) | protected function mapLines(array $lines): Generator
method mapRow (line 163) | protected function mapRow(array $row): array
FILE: src/Steps/Dom.php
class Dom (line 22) | abstract class Dom extends Step
method __construct (line 44) | final public function __construct(string|DomQuery|array $selectorOrMap...
method root (line 51) | public static function root(): static
method each (line 60) | public static function each(string|DomQuery $domQuery): static
method first (line 77) | public static function first(string|DomQuery $domQuery): static
method last (line 94) | public static function last(string|DomQuery $domQuery): static
method cssSelector (line 114) | public static function cssSelector(string $selector): CssSelector
method xPath (line 122) | public static function xPath(string $query): XPathQuery
method makeDefaultDomQueryInstance (line 127) | abstract protected function makeDefaultDomQueryInstance(string $query)...
method extract (line 132) | public function extract(string|DomQuery|array $selectorOrMapping): static
method outputType (line 143) | public function outputType(): StepOutputType
method invoke (line 154) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 181) | protected function validateAndSanitizeInput(mixed $input): HtmlDocumen...
method singleSelector (line 194) | protected function singleSelector(Node|NodeList $nodeOrNodeList): Gene...
method mapProperties (line 231) | protected function mapProperties(Node $node): array
method getBase (line 259) | protected function getBase(DomDocument|Node $document): Node|NodeList|...
method getBaseFromDomNode (line 277) | private function getBaseFromDomNode(
method getDataFromChildDomStep (line 309) | protected function getDataFromChildDomStep(Dom $step, Node $node): array
FILE: src/Steps/Dom/DomDocument.php
class DomDocument (line 8) | abstract class DomDocument extends Node
method __construct (line 10) | public function __construct(string $source)
method makeDocumentInstance (line 19) | abstract protected function makeDocumentInstance(string $source): object;
FILE: src/Steps/Dom/HtmlDocument.php
class HtmlDocument (line 17) | class HtmlDocument extends DomDocument
method getBaseHref (line 26) | public function getBaseHref(): ?string
method outerHtml (line 33) | public function outerHtml(): string
method makeChildNodeInstance (line 41) | protected function makeChildNodeInstance(object $node): Node
method makeDocumentInstance (line 49) | protected function makeDocumentInstance(string $source): object
method fixInvalidCharactersInSource (line 63) | private function fixInvalidCharactersInSource(string $source): string
FILE: src/Steps/Dom/HtmlElement.php
class HtmlElement (line 14) | class HtmlElement extends Node
method outerHtml (line 16) | public function outerHtml(): string
method innerHtml (line 21) | public function innerHtml(): string
method html (line 26) | public function html(): string
method makeChildNodeInstance (line 34) | protected function makeChildNodeInstance(object $node): Node
FILE: src/Steps/Dom/Node.php
class Node (line 11) | abstract class Node
method __construct (line 21) | public function __construct(object $node)
method querySelector (line 30) | public function querySelector(string $selector): ?Node
method querySelectorAll (line 43) | public function querySelectorAll(string $selector): NodeList
method queryXPath (line 52) | public function queryXPath(string $query): NodeList
method removeNodesMatchingSelector (line 63) | public function removeNodesMatchingSelector(string $selector): void
method removeNodesMatchingXPath (line 78) | public function removeNodesMatchingXPath(string $query): void
method nodeName (line 97) | public function nodeName(): string
method text (line 108) | public function text(): string
method getAttribute (line 122) | public function getAttribute(string $attributeName): ?string
method makeChildNodeInstance (line 134) | abstract protected function makeChildNodeInstance(object $node): Node;
method outerSource (line 136) | protected function outerSource(): string
method innerSource (line 169) | protected function innerSource(): string
method makeNodeListInstance (line 181) | protected function makeNodeListInstance(object $nodeList): NodeList
method getParentDocumentOfNode (line 196) | private function getParentDocumentOfNode(object $node): ?object
FILE: src/Steps/Dom/NodeList.php
class NodeList (line 19) | class NodeList implements IteratorAggregate, Countable
method __construct (line 24) | public function __construct(
method first (line 32) | public function first(): ?Node
method last (line 44) | public function last(): ?Node
method nth (line 57) | public function nth(int $index): ?Node
method each (line 78) | public function each(Closure $callback): array
method count (line 92) | public function count(): int
method getIterator (line 101) | public function getIterator(): Iterator
FILE: src/Steps/Dom/XmlDocument.php
class XmlDocument (line 17) | class XmlDocument extends DomDocument
method outerXml (line 19) | public function outerXml(): string
method makeChildNodeInstance (line 27) | protected function makeChildNodeInstance(object $node): Node
method makeDocumentInstance (line 35) | protected function makeDocumentInstance(string $source): object
method replaceInvalidXmlCharacters (line 71) | private function replaceInvalidXmlCharacters(string $value): string
FILE: src/Steps/Dom/XmlElement.php
class XmlElement (line 14) | class XmlElement extends Node
method outerXml (line 16) | public function outerXml(): string
method innerXml (line 21) | public function innerXml(): string
method makeChildNodeInstance (line 29) | protected function makeChildNodeInstance(object $node): Node
FILE: src/Steps/Exceptions/PreRunValidationException.php
class PreRunValidationException (line 7) | class PreRunValidationException extends Exception {}
FILE: src/Steps/Filters/AbstractFilter.php
class AbstractFilter (line 8) | abstract class AbstractFilter implements FilterInterface
method useKey (line 14) | public function useKey(string $key): static
method addOr (line 25) | public function addOr(FilterInterface $filter): void
method getOr (line 43) | public function getOr(): ?FilterInterface
method negate (line 48) | public function negate(): NegatedFilter
method getKey (line 56) | protected function getKey(mixed $value): mixed
FILE: src/Steps/Filters/ArrayFilter.php
class ArrayFilter (line 7) | class ArrayFilter extends AbstractFilter
method evaluate (line 14) | public function evaluate(mixed $valueInQuestion): bool
FILE: src/Steps/Filters/ClosureFilter.php
class ClosureFilter (line 8) | class ClosureFilter extends AbstractFilter
method __construct (line 10) | public function __construct(
method evaluate (line 17) | public function evaluate(mixed $valueInQuestion): bool
FILE: src/Steps/Filters/ComparisonFilter.php
class ComparisonFilter (line 8) | class ComparisonFilter extends AbstractFilter
method __construct (line 10) | public function __construct(
method evaluate (line 18) | public function evaluate(mixed $valueInQuestion): bool
FILE: src/Steps/Filters/Enums/ComparisonFilterRule.php
method evaluate (line 19) | public function evaluate(mixed $value, mixed $compareTo): bool
FILE: src/Steps/Filters/Enums/StringFilterRule.php
method evaluate (line 13) | public function evaluate(string $haystack, string $needle): bool
FILE: src/Steps/Filters/Enums/StringLengthFilterRule.php
method evaluate (line 19) | public function evaluate(string $subject, int $compareTo): bool
FILE: src/Steps/Filters/Enums/UrlFilterRule.php
method evaluate (line 23) | public function evaluate(string $url, string $needle): bool
method prepareRegex (line 39) | protected function prepareRegex(string $regex): string
FILE: src/Steps/Filters/Filter.php
class Filter (line 11) | abstract class Filter
method equal (line 13) | public static function equal(mixed $equalToValue): ComparisonFilter
method notEqual (line 18) | public static function notEqual(mixed $notEqualToValue): ComparisonFilter
method greaterThan (line 23) | public static function greaterThan(mixed $greaterThanValue): Compariso...
method greaterThanOrEqual (line 28) | public static function greaterThanOrEqual(mixed $greaterThanOrEqualVal...
method lessThan (line 33) | public static function lessThan(mixed $lessThanValue): ComparisonFilter
method lessThanOrEqual (line 38) | public static function lessThanOrEqual(mixed $lessThanOrEqualValue): C...
method stringContains (line 43) | public static function stringContains(string $containsValue): StringFi...
method stringStartsWith (line 48) | public static function stringStartsWith(string $startsWithValue): Stri...
method stringEndsWith (line 53) | public static function stringEndsWith(string $endsWithValue): StringFi...
method stringLengthEqual (line 58) | public static function stringLengthEqual(int $length): StringLengthFilter
method stringLengthNotEqual (line 63) | public static function stringLengthNotEqual(int $length): StringLength...
method stringLengthGreaterThan (line 68) | public static function stringLengthGreaterThan(int $length): StringLen...
method stringLengthGreaterThanOrEqual (line 73) | public static function stringLengthGreaterThanOrEqual(int $length): St...
method stringLengthLessThan (line 78) | public static function stringLengthLessThan(int $length): StringLength...
method stringLengthLessThanOrEqual (line 83) | public static function stringLengthLessThanOrEqual(int $length): Strin...
method urlScheme (line 88) | public static function urlScheme(string $urlSchemeValue): UrlFilter
method urlHost (line 93) | public static function urlHost(string $urlHostValue): UrlFilter
method urlDomain (line 98) | public static function urlDomain(string $urlDomainValue): UrlFilter
method urlPath (line 103) | public static function urlPath(string $urlPathValue): UrlFilter
method urlPathStartsWith (line 108) | public static function urlPathStartsWith(string $urlPathStartsWithValu...
method urlPathMatches (line 113) | public static function urlPathMatches(string $urlPathMatchesValue): Ur...
method arrayHasElement (line 118) | public static function arrayHasElement(): ArrayFilter
method custom (line 123) | public static function custom(Closure $closure): ClosureFilter
FILE: src/Steps/Filters/FilterInterface.php
type FilterInterface (line 5) | interface FilterInterface
method useKey (line 10) | public function useKey(string $key): static;
method evaluate (line 15) | public function evaluate(mixed $valueInQuestion): bool;
method addOr (line 17) | public function addOr(FilterInterface $filter): void;
method getOr (line 19) | public function getOr(): ?FilterInterface;
method negate (line 21) | public function negate(): NegatedFilter;
FILE: src/Steps/Filters/Filterable.php
type Filterable (line 9) | trait Filterable
method where (line 16) | public function where(string|FilterInterface $keyOrFilter, ?FilterInte...
method orWhere (line 38) | public function orWhere(string|FilterInterface $keyOrFilter, ?FilterIn...
method passesAllFilters (line 57) | protected function passesAllFilters(mixed $output): bool
FILE: src/Steps/Filters/NegatedFilter.php
class NegatedFilter (line 5) | final class NegatedFilter implements FilterInterface
method __construct (line 7) | public function __construct(private readonly FilterInterface $filter) {}
method useKey (line 9) | public function useKey(string $key): static
method evaluate (line 16) | public function evaluate(mixed $valueInQuestion): bool
method addOr (line 21) | public function addOr(FilterInterface $filter): void
method getOr (line 26) | public function getOr(): ?FilterInterface
method negate (line 31) | public function negate(): NegatedFilter
FILE: src/Steps/Filters/StringFilter.php
class StringFilter (line 8) | class StringFilter extends AbstractFilter
method __construct (line 10) | public function __construct(
method evaluate (line 18) | public function evaluate(mixed $valueInQuestion): bool
FILE: src/Steps/Filters/StringLengthFilter.php
class StringLengthFilter (line 8) | class StringLengthFilter extends AbstractFilter
method __construct (line 10) | public function __construct(
method evaluate (line 18) | public function evaluate(mixed $valueInQuestion): bool
FILE: src/Steps/Filters/UrlFilter.php
class UrlFilter (line 8) | class UrlFilter extends AbstractFilter
method __construct (line 10) | public function __construct(protected readonly UrlFilterRule $filterRu...
method evaluate (line 15) | public function evaluate(mixed $valueInQuestion): bool
FILE: src/Steps/Group.php
class Group (line 12) | final class Group extends BaseStep
method invokeStep (line 29) | public function invokeStep(Input $input): Generator
method addStep (line 71) | public function addStep(StepInterface $step): self
method addLogger (line 90) | public function addLogger(LoggerInterface $logger): static
method setLoader (line 101) | public function setLoader(LoaderInterface $loader): self
method maxOutputs (line 114) | public function maxOutputs(int $maxOutputs): static
method outputType (line 125) | public function outputType(): StepOutputType
method includeOutput (line 130) | protected function includeOutput(StepInterface $step): bool
method addToCombinedOutputData (line 146) | private function addToCombinedOutputData(mixed $add, array $combined, ...
method getNewlyKeptData (line 162) | private function getNewlyKeptData(Output $output, Input $input): array
method prepareCombinedOutputs (line 176) | private function prepareCombinedOutputs(array $combinedOutputs, array ...
method normalizeCombinedOutputs (line 214) | private function normalizeCombinedOutputs(array $combinedOutputs): array
FILE: src/Steps/Html.php
class Html (line 16) | class Html extends Dom
method getLink (line 21) | public static function getLink(?string $selector = null): GetLink
method getLinks (line 29) | public static function getLinks(?string $selector = null): GetLinks
method metaData (line 34) | public static function metaData(): MetaData
method schemaOrg (line 39) | public static function schemaOrg(): SchemaOrg
method validateAndSanitizeInput (line 49) | protected function validateAndSanitizeInput(mixed $input): HtmlDocument
method makeDefaultDomQueryInstance (line 61) | protected function makeDefaultDomQueryInstance(string $query): DomQuery
FILE: src/Steps/Html/CssSelector.php
class CssSelector (line 15) | final class CssSelector extends DomQuery
method __construct (line 20) | public function __construct(string $query)
method filter (line 43) | protected function filter(Node $node): NodeList
FILE: src/Steps/Html/DomQuery.php
class DomQuery (line 16) | abstract class DomQuery
method __construct (line 40) | public function __construct(
method apply (line 48) | public function apply(Node $node): array|string|null
method first (line 83) | public function first(): self
method last (line 90) | public function last(): self
method nth (line 97) | public function nth(int $n): self
method even (line 108) | public function even(): self
method odd (line 115) | public function odd(): self
method text (line 122) | public function text(): self
method formattedText (line 129) | public function formattedText(?Html2Text $converter = null): self
method html (line 140) | public function html(): self
method attribute (line 147) | public function attribute(string $attributeName): self
method outerHtml (line 156) | public function outerHtml(): self
method link (line 163) | public function link(): self
method withoutFragment (line 174) | public function withoutFragment(): self
method toAbsoluteUrl (line 186) | public function toAbsoluteUrl(): self
method setBaseUrl (line 198) | public function setBaseUrl(string $baseUrl): static
method filter (line 209) | abstract protected function filter(Node $node): NodeList;
method filtersMatches (line 211) | protected function filtersMatches(): bool
method filterMatches (line 224) | protected function filterMatches(NodeList $matches): ?NodeList
method filterEvenOrOdd (line 256) | protected function filterEvenOrOdd(NodeList $domCrawler): NodeList
method getTarget (line 280) | protected function getTarget(HtmlElement|XmlElement $node): string
method handleUrlFragment (line 320) | protected function handleUrlFragment(Url $url): Url
FILE: src/Steps/Html/Exceptions/InvalidDomQueryException.php
class InvalidDomQueryException (line 10) | class InvalidDomQueryException extends Exception
method make (line 14) | public static function make(string $message, string $domQuery): self
method fromSymfonyException (line 23) | public static function fromSymfonyException(
method fromDomException (line 38) | public static function fromDomException(string $domQuery, DOMException...
method setDomQuery (line 51) | public function setDomQuery(string $domQuery): void
method getDomQuery (line 56) | public function getDomQuery(): string
FILE: src/Steps/Html/GetLink.php
class GetLink (line 18) | class GetLink extends Step
method __construct (line 43) | public function __construct(string|CssSelector|null $selector = null)
method isSpecialNonHttpLink (line 48) | public static function isSpecialNonHttpLink(HtmlElement $linkElement):...
method outputType (line 57) | public function outputType(): StepOutputType
method validateAndSanitizeInput (line 65) | protected function validateAndSanitizeInput(mixed $input): HtmlDocument
method invoke (line 81) | protected function invoke(mixed $input): Generator
method onSameDomain (line 102) | public function onSameDomain(): static
method notOnSameDomain (line 109) | public function notOnSameDomain(): static
method onDomain (line 120) | public function onDomain(string|array $domains): static
method onSameHost (line 133) | public function onSameHost(): static
method notOnSameHost (line 140) | public function notOnSameHost(): static
method onHost (line 150) | public function onHost(string|array $hosts): static
method withoutFragment (line 163) | public function withoutFragment(): static
method getBaseFromDocument (line 173) | protected function getBaseFromDocument(HtmlDocument $document): void
method getLinkUrl (line 185) | protected function getLinkUrl(HtmlElement $link): ?Url
method matchesAdditionalCriteria (line 211) | protected function matchesAdditionalCriteria(Url $link): bool
method isOnSameDomain (line 219) | protected function isOnSameDomain(Url $link): bool
method isOnSameHost (line 225) | protected function isOnSameHost(Url $link): bool
method isOnDomain (line 234) | protected function isOnDomain(Url $link): bool
method isOnHost (line 250) | protected function isOnHost(Url $link): bool
method isArrayWithOnlyStrings (line 267) | protected function isArrayWithOnlyStrings(array $array): bool
method handleUrlFragment (line 281) | protected function handleUrlFragment(Url $url): Url
FILE: src/Steps/Html/GetLinks.php
class GetLinks (line 9) | class GetLinks extends GetLink
method invoke (line 16) | protected function invoke(mixed $input): Generator
FILE: src/Steps/Html/MetaData.php
class MetaData (line 11) | class MetaData extends Step
method only (line 21) | public function only(array $keys): static
method outputType (line 28) | public function outputType(): StepOutputType
method invoke (line 36) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 58) | protected function validateAndSanitizeInput(mixed $input): mixed
method getTitle (line 63) | protected function getTitle(HtmlDocument $document): string
method addToData (line 78) | protected function addToData(array $data, string $key, string $value):...
FILE: src/Steps/Html/SchemaOrg.php
class SchemaOrg (line 12) | class SchemaOrg extends Step
method toArray (line 23) | public function toArray(): static
method onlyType (line 30) | public function onlyType(string $type = ''): static
method extract (line 40) | public function extract(array $mapping): static
method outputType (line 47) | public function outputType(): StepOutputType
method invoke (line 55) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 73) | protected function validateAndSanitizeInput(mixed $input): string
method scanChildrenForType (line 78) | protected function scanChildrenForType(BaseType $schemaOrgObject): Gen...
method prepareReturnValue (line 94) | protected function prepareReturnValue(BaseType $object): BaseType|array
method applyMapping (line 111) | protected function applyMapping(array $schemaOrgData): array
FILE: src/Steps/Html/XPathQuery.php
class XPathQuery (line 11) | class XPathQuery extends DomQuery
method __construct (line 16) | public function __construct(string $query)
method filter (line 27) | protected function filter(Node $node): NodeList
method validateQuery (line 39) | private function validateQuery(string $query): void
FILE: src/Steps/Json.php
class Json (line 12) | class Json extends Step
method __construct (line 17) | final public function __construct(protected ?array $propertyMapping = ...
method all (line 19) | public static function all(): static
method get (line 27) | public static function get(array $propertyMapping = []): static
method each (line 35) | public static function each(string $each, array $propertyMapping = [])...
method outputType (line 40) | public function outputType(): StepOutputType
method validateAndSanitizeInput (line 45) | protected function validateAndSanitizeInput(mixed $input): mixed
method invoke (line 50) | protected function invoke(mixed $input): Generator
method inputStringToArray (line 84) | protected function inputStringToArray(string $input): ?array
method mapProperties (line 107) | protected function mapProperties(Dot $dot): array
FILE: src/Steps/Loading/GetSitemapsFromRobotsTxt.php
class GetSitemapsFromRobotsTxt (line 13) | class GetSitemapsFromRobotsTxt extends Step
method outputType (line 20) | public function outputType(): StepOutputType
method invoke (line 28) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 40) | protected function validateAndSanitizeInput(mixed $input): UriInterface
FILE: src/Steps/Loading/Http.php
class Http (line 19) | class Http extends HttpBase
method crawl (line 24) | public static function crawl(array $headers = [], string $httpVersion ...
method get (line 32) | public static function get(array $headers = [], string $httpVersion = ...
method post (line 40) | public static function post(
method put (line 51) | public static function put(
method patch (line 62) | public static function patch(
method delete (line 73) | public static function delete(
method getBodyString (line 87) | public static function getBodyString(MessageInterface|RespondedRequest...
method paginate (line 107) | public function paginate(
method outputType (line 120) | public function outputType(): StepOutputType
method invoke (line 130) | protected function invoke(mixed $input): Generator
method transferSettingsToPaginateStep (line 150) | private function transferSettingsToPaginateStep(Paginate $step): Paginate
FILE: src/Steps/Loading/Http/AbstractPaginator.php
class AbstractPaginator (line 13) | abstract class AbstractPaginator
method __construct (line 31) | public function __construct(protected int $maxPages = Paginator::MAX_P...
method processLoaded (line 33) | public function processLoaded(
method hasFinished (line 40) | public function hasFinished(): bool
method resetFinished (line 55) | public function resetFinished(): void
method stopWhen (line 64) | public function stopWhen(Closure|StopRule $callback): self
method logWhenFinished (line 71) | public function logWhenFinished(LoggerInterface $logger): void
method getNextRequest (line 80) | abstract public function getNextRequest(): ?RequestInterface;
method registerLoadedRequest (line 82) | protected function registerLoadedRequest(RequestInterface|RespondedReq...
method shouldStop (line 111) | protected function shouldStop(RequestInterface $request, ?RespondedReq...
method maxPagesReached (line 128) | protected function maxPagesReached(): bool
method setFinished (line 133) | protected function setFinished(): self
FILE: src/Steps/Loading/Http/Browser/BrowserAction.php
class BrowserAction (line 13) | class BrowserAction
method waitUntilDocumentContainsElement (line 17) | public static function waitUntilDocumentContainsElement(
method clickElement (line 26) | public static function clickElement(
method clickInsideShadowDom (line 43) | public static function clickInsideShadowDom(
method moveMouseToElement (line 73) | public static function moveMouseToElement(string $cssSelector, int $ti...
method moveMouseToPosition (line 82) | public static function moveMouseToPosition(int $x, int $y, ?int $steps...
method scrollDown (line 93) | public static function scrollDown(int $distance): Closure
method scrollUp (line 100) | public static function scrollUp(int $distance): Closure
method typeText (line 107) | public static function typeText(string $text, ?int $delay = null): Clo...
method evaluate (line 118) | public static function evaluate(string $jsCode): Closure
method waitForReload (line 125) | public static function waitForReload(int $timeout = self::DEFAULT_TIME...
method wait (line 132) | public static function wait(float $seconds): Closure
method screenshot (line 139) | public static function screenshot(ScreenshotConfig $config): Closure
method evaluateAndWaitForReload (line 161) | public static function evaluateAndWaitForReload(string $jsCode): Closure
method clickElementAndWaitForReload (line 171) | public static function clickElementAndWaitForReload(string $cssSelecto...
FILE: src/Steps/Loading/Http/Document.php
class Document (line 12) | final class Document
method __construct (line 22) | public function __construct(
method dom (line 33) | public function dom(): HtmlDocument
method url (line 38) | public function url(): Url
method baseUrl (line 43) | public function baseUrl(): Url
method canonicalUrl (line 48) | public function canonicalUrl(): string
method setBaseUrl (line 73) | private function setBaseUrl(): void
FILE: src/Steps/Loading/Http/Paginate.php
class Paginate (line 19) | class Paginate extends Http
method __construct (line 21) | public function __construct(
method invoke (line 35) | protected function invoke(mixed $input): Generator
method paginateInputUrl (line 49) | private function paginateInputUrl(UriInterface $url): Generator
method finish (line 80) | private function finish(): void
method processLoaded (line 89) | private function processLoaded(RequestInterface $request, ?RespondedRe...
FILE: src/Steps/Loading/Http/Paginator.php
class Paginator (line 10) | class Paginator
method simpleWebsite (line 17) | public static function simpleWebsite(
method queryParams (line 24) | public static function queryParams(int $maxPages = Paginator::MAX_PAGE...
FILE: src/Steps/Loading/Http/Paginators/QueryParams/AbstractQueryParamManipulator.php
class AbstractQueryParamManipulator (line 9) | abstract class AbstractQueryParamManipulator implements QueryParamManipu...
method __construct (line 11) | public function __construct(protected string $queryParamName) {}
method getCurrentValue (line 16) | protected function getCurrentValue(Query $query, mixed $fallbackValue ...
method getCurrentValueUsingDotNotation (line 28) | protected function getCurrentValueUsingDotNotation(Query $query, mixed...
method getCurrentValueAsInt (line 38) | protected function getCurrentValueAsInt(Query $query): int
method getCurrentValueAsIntUsingDotNotation (line 46) | protected function getCurrentValueAsIntUsingDotNotation(Query $query):...
FILE: src/Steps/Loading/Http/Paginators/QueryParams/Decrementor.php
class Decrementor (line 9) | class Decrementor extends AbstractQueryParamManipulator
method __construct (line 11) | public function __construct(
method execute (line 22) | public function execute(Query $query): Query
FILE: src/Steps/Loading/Http/Paginators/QueryParams/Incrementor.php
class Incrementor (line 9) | class Incrementor extends AbstractQueryParamManipulator
method __construct (line 11) | public function __construct(
method execute (line 22) | public function execute(Query $query): Query
FILE: src/Steps/Loading/Http/Paginators/QueryParams/QueryParamManipulator.php
type QueryParamManipulator (line 7) | interface QueryParamManipulator
method execute (line 9) | public function execute(Query $query): Query;
FILE: src/Steps/Loading/Http/Paginators/QueryParamsPaginator.php
class QueryParamsPaginator (line 16) | class QueryParamsPaginator extends Http\AbstractPaginator
method paramsInUrl (line 28) | public static function paramsInUrl(int $maxPages = Paginator::MAX_PAGE...
method inUrl (line 33) | public function inUrl(): self
method paramsInBody (line 40) | public static function paramsInBody(int $maxPages = Paginator::MAX_PAG...
method inBody (line 49) | public function inBody(): self
method increase (line 56) | public function increase(string $queryParamName, int $by = 1, bool $us...
method increaseUsingDotNotation (line 63) | public function increaseUsingDotNotation(string $queryParamName, int $...
method decrease (line 70) | public function decrease(string $queryParamName, int $by = 1, bool $us...
method decreaseUsingDotNotation (line 77) | public function decreaseUsingDotNotation(string $queryParamName, int $...
method getNextRequest (line 87) | public function getNextRequest(): ?RequestInterface
FILE: src/Steps/Loading/Http/Paginators/SimpleWebsitePaginator.php
class SimpleWebsitePaginator (line 17) | class SimpleWebsitePaginator extends Http\AbstractPaginator
method __construct (line 41) | public function __construct(string|DomQuery $paginationLinksSelector, ...
method hasFinished (line 52) | public function hasFinished(): bool
method getNextRequest (line 57) | public function getNextRequest(): ?RequestInterface
method processLoaded (line 79) | public function processLoaded(
method logWhenFinished (line 100) | public function logWhenFinished(LoggerInterface $logger): void
method getPaginationLinksFromResponse (line 112) | protected function getPaginationLinksFromResponse(RespondedRequest $re...
method addFoundUrlFromLinkElement (line 143) | protected function addFoundUrlFromLinkElement(
method getAbsoluteUrlFromLinkElement (line 158) | protected function getAbsoluteUrlFromLinkElement(
method isRelevantLinkElement (line 176) | protected function isRelevantLinkElement(Dom\HtmlElement $element): bool
method addFoundUrl (line 187) | protected function addFoundUrl(string $url): void
method cleanUpParentRequests (line 204) | protected function cleanUpParentRequests(): void
FILE: src/Steps/Loading/Http/Paginators/StopRules/Contains.php
class Contains (line 10) | class Contains implements StopRule
method __construct (line 12) | public function __construct(protected string $contains) {}
method shouldStop (line 17) | public function shouldStop(RequestInterface $request, ?RespondedReques...
FILE: src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInDom.php
class IsEmptyInDom (line 17) | abstract class IsEmptyInDom implements StopRule
method __construct (line 19) | public function __construct(protected string|DomQuery $selector) {}
method shouldStop (line 24) | public function shouldStop(RequestInterface $request, ?RespondedReques...
method makeDom (line 58) | abstract protected function makeDom(string $source): DomDocument;
method nodeIsEmpty (line 60) | private function nodeIsEmpty(HtmlElement|XmlElement $node): bool
FILE: src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInHtml.php
class IsEmptyInHtml (line 8) | class IsEmptyInHtml extends IsEmptyInDom
method makeDom (line 10) | protected function makeDom(string $source): DomDocument
FILE: src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInJson.php
class IsEmptyInJson (line 12) | class IsEmptyInJson implements StopRule
method __construct (line 14) | public function __construct(protected string $dotNotationKey) {}
method shouldStop (line 19) | public function shouldStop(RequestInterface $request, ?RespondedReques...
FILE: src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInXml.php
class IsEmptyInXml (line 8) | class IsEmptyInXml extends IsEmptyInDom
method makeDom (line 10) | protected function makeDom(string $source): DomDocument
FILE: src/Steps/Loading/Http/Paginators/StopRules/IsEmptyResponse.php
class IsEmptyResponse (line 9) | class IsEmptyResponse implements StopRule
method shouldStop (line 11) | public function shouldStop(RequestInterface $request, ?RespondedReques...
FILE: src/Steps/Loading/Http/Paginators/StopRules/NotContains.php
class NotContains (line 10) | class NotContains implements StopRule
method __construct (line 12) | public function __construct(protected string $contains) {}
method shouldStop (line 17) | public function shouldStop(RequestInterface $request, ?RespondedReques...
FILE: src/Steps/Loading/Http/Paginators/StopRules/PaginatorStopRules.php
class PaginatorStopRules (line 7) | class PaginatorStopRules
method isEmptyResponse (line 9) | public static function isEmptyResponse(): IsEmptyResponse
method isEmptyInJson (line 14) | public static function isEmptyInJson(string $dotNotationKey): IsEmptyI...
method isEmptyInHtml (line 19) | public static function isEmptyInHtml(string|DomQuery $selector): IsEmp...
method isEmptyInXml (line 24) | public static function isEmptyInXml(string|DomQuery $selector): IsEmpt...
method contains (line 29) | public static function contains(string $string): Contains
method notContains (line 34) | public static function notContains(string $string): NotContains
FILE: src/Steps/Loading/Http/Paginators/StopRules/StopRule.php
type StopRule (line 8) | interface StopRule
method shouldStop (line 10) | public function shouldStop(RequestInterface $request, ?RespondedReques...
FILE: src/Steps/Loading/HttpBase.php
class HttpBase (line 20) | abstract class HttpBase extends Step
method __construct (line 66) | public function __construct(
method stopOnErrorResponse (line 73) | public function stopOnErrorResponse(): static
method yieldErrorResponses (line 80) | public function yieldErrorResponses(): static
method useInputKeyAsUrl (line 93) | public function useInputKeyAsUrl(string $key): static
method useInputKeyAsBody (line 106) | public function useInputKeyAsBody(string $key): static
method useInputKeyAsHeader (line 118) | public function useInputKeyAsHeader(string $key, ?string $asHeader = n...
method useInputKeyAsHeaders (line 138) | public function useInputKeyAsHeaders(string $key): static
method postBrowserNavigateHook (line 145) | public function postBrowserNavigateHook(Closure $callback): static
method skipCache (line 171) | public function skipCache(): static
method useBrowser (line 187) | public function useBrowser(): static
method staticUrl (line 194) | public function staticUrl(string $url): static
method validateAndSanitizeInput (line 205) | protected function validateAndSanitizeInput(mixed $input): mixed
method outputKeyAliases (line 224) | protected function outputKeyAliases(): array
method getResponseFromInputUri (line 238) | protected function getResponseFromInputUri(UriInterface $input): ?Resp...
method getRequestFromInputUri (line 245) | protected function getRequestFromInputUri(UriInterface $uri): RequestI...
method getResponseFromRequest (line 260) | protected function getResponseFromRequest(RequestInterface $request): ...
method applyTempLoaderCustomizations (line 283) | private function applyTempLoaderCustomizations(): array
method resetTempLoaderCustomizations (line 320) | private function resetTempLoaderCustomizations(array $resetConfig): void
method getUrlFromArrayInput (line 337) | protected function getUrlFromArrayInput(mixed $input): mixed
method getBodyFromArrayInput (line 358) | protected function getBodyFromArrayInput(mixed $input): void
method getHeadersFromArrayInput (line 373) | protected function getHeadersFromArrayInput(mixed $input): void
method addToInputHeadersFromInput (line 399) | protected function addToInputHeadersFromInput(mixed $input, string $in...
method mergeHeaders (line 427) | protected function mergeHeaders(): array
method resetInputRequestParams (line 440) | protected function resetInputRequestParams(): void
method resolveStaticUrl (line 447) | private function resolveStaticUrl(): string
method resolveVarsInRequestProperties (line 465) | private function resolveVarsInRequestProperties(StreamInterface|string...
method resolveVarsInHeaders (line 490) | private function resolveVarsInHeaders(array $headers, array $fullInput...
FILE: src/Steps/Loading/HttpCrawl.php
class HttpCrawl (line 19) | class HttpCrawl extends Http
method __construct (line 57) | public function __construct(array $headers = [], string $httpVersion =...
method depth (line 62) | public function depth(int $depth): static
method sameHost (line 69) | public function sameHost(): static
method sameDomain (line 78) | public function sameDomain(): static
method pathStartsWith (line 87) | public function pathStartsWith(string $startsWith = ''): static
method pathMatches (line 94) | public function pathMatches(string $regexPattern = ''): static
method customFilter (line 101) | public function customFilter(Closure $closure): static
method inputIsSitemap (line 108) | public function inputIsSitemap(): static
method loadAllButYieldOnlyMatching (line 115) | public function loadAllButYieldOnlyMatching(): static
method keepUrlFragment (line 122) | public function keepUrlFragment(): static
method useCanonicalLinks (line 129) | public function useCanonicalLinks(): static
method validateAndSanitizeInput (line 136) | protected function validateAndSanitizeInput(mixed $input): mixed
method invoke (line 145) | protected function invoke(mixed $input): Generator
method setHostOrDomain (line 185) | protected function setHostOrDomain(UriInterface $uri): void
method loadUrls (line 203) | protected function loadUrls(): Generator
method getUrlsFromInitialResponse (line 242) | protected function getUrlsFromInitialResponse(RespondedRequest $respon...
method getUrlsFromSitemap (line 257) | protected function getUrlsFromSitemap(RespondedRequest $respondedReque...
method getUrlsFromHtmlDocument (line 294) | protected function getUrlsFromHtmlDocument(Document $document): array
method addLoadedUrlsFromResponse (line 333) | protected function addLoadedUrlsFromResponse(RespondedRequest $respond...
method wasAlreadyLoaded (line 355) | protected function wasAlreadyLoaded(RespondedRequest $respondedRequest...
method addCanonicalUrlToLoadedUrls (line 377) | protected function addCanonicalUrlToLoadedUrls(Document $document): void
method yieldResponse (line 388) | protected function yieldResponse(Document $document, bool $urlMatchesC...
method setResponseCanonicalUrl (line 400) | protected function setResponseCanonicalUrl(RespondedRequest $responded...
method depthIsExceeded (line 409) | protected function depthIsExceeded(int $depth): bool
method matchesAllCriteria (line 417) | protected function matchesAllCriteria(Url $url, ?HtmlElement $linkElem...
method matchesCriteriaBesidesHostOrDomain (line 425) | protected function matchesCriteriaBesidesHostOrDomain(Url $url, ?HtmlE...
method isOnSameHostOrDomain (line 434) | protected function isOnSameHostOrDomain(Url $url): bool
method matchesPathCriteria (line 446) | protected function matchesPathCriteria(Url $url): bool
method matchesCustomCriteria (line 458) | protected function matchesCustomCriteria(Url $url, ?HtmlElement $linkE...
method handleUrlFragment (line 466) | protected function handleUrlFragment(Url $url): Url
FILE: src/Steps/Loading/LoadingStep.php
type LoadingStep (line 11) | trait LoadingStep
method setLoader (line 26) | public function setLoader(LoaderInterface $loader): static
method withLoader (line 36) | public function withLoader(LoaderInterface $loader): static
method getLoader (line 46) | protected function getLoader(): LoaderInterface
FILE: src/Steps/Refiners/AbstractRefiner.php
class AbstractRefiner (line 7) | abstract class AbstractRefiner implements RefinerInterface
method addLogger (line 11) | public function addLogger(LoggerInterface $logger): static
method logTypeWarning (line 18) | protected function logTypeWarning(string $staticRefinerMethod, mixed $...
FILE: src/Steps/Refiners/DateTime/DateTimeFormat.php
class DateTimeFormat (line 8) | class DateTimeFormat extends AbstractStringRefiner
method __construct (line 10) | public function __construct(protected string $targetFormat, protected ...
method refine (line 12) | public function refine(mixed $value): mixed
method parseFromUnknownFormat (line 35) | private function parseFromUnknownFormat(string $value): ?DateTime
FILE: src/Steps/Refiners/DateTimeRefiner.php
class DateTimeRefiner (line 7) | class DateTimeRefiner
method reformat (line 9) | public static function reformat(string $targetFormat, ?string $originF...
FILE: src/Steps/Refiners/Html/RemoveFromHtml.php
class RemoveFromHtml (line 13) | class RemoveFromHtml extends AbstractStringRefiner
method __construct (line 20) | public function __construct(string|DomQuery $selector)
method refine (line 38) | public function refine(mixed $value): mixed
FILE: src/Steps/Refiners/HtmlRefiner.php
class HtmlRefiner (line 8) | class HtmlRefiner
method remove (line 10) | public static function remove(string|DomQuery $selector): RemoveFromHtml
FILE: src/Steps/Refiners/RefinerInterface.php
type RefinerInterface (line 7) | interface RefinerInterface
method refine (line 9) | public function refine(mixed $value): mixed;
method addLogger (line 11) | public function addLogger(LoggerInterface $logger): static;
FILE: src/Steps/Refiners/String/AbstractStringRefiner.php
class AbstractStringRefiner (line 8) | abstract class AbstractStringRefiner extends AbstractRefiner
method apply (line 14) | protected function apply(mixed $value, Closure $refiner, string $stati...
FILE: src/Steps/Refiners/String/StrAfterFirst.php
class StrAfterFirst (line 5) | class StrAfterFirst extends AbstractStringRefiner
method __construct (line 7) | public function __construct(protected readonly string $first) {}
method refine (line 9) | public function refine(mixed $value): mixed
FILE: src/Steps/Refiners/String/StrAfterLast.php
class StrAfterLast (line 5) | class StrAfterLast extends AbstractStringRefiner
method __construct (line 7) | public function __construct(protected readonly string $last) {}
method refine (line 9) | public function refine(mixed $value): mixed
FILE: src/Steps/Refiners/String/StrBeforeFirst.php
class StrBeforeFirst (line 5) | class StrBeforeFirst extends AbstractStringRefiner
method __construct (line 7) | public function __construct(protected readonly string $first) {}
method refine (line 9) | public function refine(mixed $value): mixed
FILE: src/Steps/Refiners/String/StrBeforeLast.php
class StrBeforeLast (line 5) | class StrBeforeLast extends AbstractStringRefiner
method __construct (line 7) | public function __construct(protected readonly string $last) {}
method refine (line 9) | public function refine(mixed $value): mixed
FILE: src/Steps/Refiners/String/StrBetweenFirst.php
class StrBetweenFirst (line 5) | class StrBetweenFirst extends AbstractStringRefiner
method __construct (line 7) | public function __construct(protected readonly string $start, protecte...
method refine (line 9) | public function refine(mixed $value): mixed
FILE: src/Steps/Refiners/String/StrBetweenLast.php
class StrBetweenLast (line 5) | class StrBetweenLast extends AbstractStringRefiner
method __construct (line 7) | public function __construct(protected readonly string $start, protecte...
method refine (line 9) | public function refine(mixed $value): mixed
FILE: src/Steps/Refiners/String/StrReplace.php
class StrReplace (line 5) | class StrReplace extends AbstractStringRefiner
method __construct (line 11) | public function __construct(
method refine (line 16) | public function refine(mixed $value): mixed
FILE: src/Steps/Refiners/StringRefiner.php
class StringRefiner (line 13) | class StringRefiner
method afterFirst (line 15) | public static function afterFirst(string $first): StrAfterFirst
method afterLast (line 20) | public static function afterLast(string $last): StrAfterLast
method beforeFirst (line 25) | public static function beforeFirst(string $first): StrBeforeFirst
method beforeLast (line 30) | public static function beforeLast(string $last): StrBeforeLast
method betweenFirst (line 35) | public static function betweenFirst(string $start, string $end): StrBe...
method betweenLast (line 40) | public static function betweenLast(string $start, string $end): StrBet...
method replace (line 49) | public static function replace(string|array $search, string|array $rep...
FILE: src/Steps/Refiners/Url/AbstractUrlRefiner.php
class AbstractUrlRefiner (line 11) | abstract class AbstractUrlRefiner extends AbstractRefiner
method refine (line 16) | public function refine(mixed $value): mixed
method staticRefinerMethod (line 39) | abstract protected function staticRefinerMethod(): string;
method refineUrl (line 41) | abstract protected function refineUrl(Url $url): string;
FILE: src/Steps/Refiners/Url/WithFragment.php
class WithFragment (line 9) | class WithFragment extends AbstractUrlRefiner
method __construct (line 11) | public function __construct(protected readonly string $fragment) {}
method staticRefinerMethod (line 13) | protected function staticRefinerMethod(): string
method refineUrl (line 21) | protected function refineUrl(Url $url): string
FILE: src/Steps/Refiners/Url/WithHost.php
class WithHost (line 9) | class WithHost extends AbstractUrlRefiner
method __construct (line 11) | public function __construct(protected readonly string $host) {}
method staticRefinerMethod (line 13) | protected function staticRefinerMethod(): string
method refineUrl (line 21) | protected function refineUrl(Url $url): string
FILE: src/Steps/Refiners/Url/WithPath.php
class WithPath (line 9) | class WithPath extends AbstractUrlRefiner
method __construct (line 11) | public function __construct(protected readonly string $path) {}
method staticRefinerMethod (line 13) | protected function staticRefinerMethod(): string
method refineUrl (line 21) | protected function refineUrl(Url $url): string
FILE: src/Steps/Refiners/Url/WithPort.php
class WithPort (line 9) | class WithPort extends AbstractUrlRefiner
method __construct (line 11) | public function __construct(protected readonly int $port) {}
method staticRefinerMethod (line 13) | protected function staticRefinerMethod(): string
method refineUrl (line 21) | protected function refineUrl(Url $url): string
FILE: src/Steps/Refiners/Url/WithQuery.php
class WithQuery (line 9) | class WithQuery extends AbstractUrlRefiner
method __construct (line 11) | public function __construct(protected readonly string $query) {}
method staticRefinerMethod (line 13) | protected function staticRefinerMethod(): string
method refineUrl (line 21) | protected function refineUrl(Url $url): string
FILE: src/Steps/Refiners/Url/WithScheme.php
class WithScheme (line 9) | class WithScheme extends AbstractUrlRefiner
method __construct (line 11) | public function __construct(protected readonly string $scheme) {}
method staticRefinerMethod (line 13) | protected function staticRefinerMethod(): string
method refineUrl (line 21) | protected function refineUrl(Url $url): string
FILE: src/Steps/Refiners/Url/WithoutPort.php
class WithoutPort (line 9) | class WithoutPort extends AbstractUrlRefiner
method staticRefinerMethod (line 11) | protected function staticRefinerMethod(): string
method refineUrl (line 19) | protected function refineUrl(Url $url): string
FILE: src/Steps/Refiners/UrlRefiner.php
class UrlRefiner (line 13) | class UrlRefiner
method withScheme (line 15) | public static function withScheme(string $scheme): WithScheme
method withHost (line 20) | public static function withHost(string $host): WithHost
method withPort (line 25) | public static function withPort(int $port): WithPort
method withoutPort (line 30) | public static function withoutPort(): WithoutPort
method withPath (line 35) | public static function withPath(string $path): WithPath
method withQuery (line 40) | public static function withQuery(string $query): WithQuery
method withoutQuery (line 45) | public static function withoutQuery(): WithQuery
method withFragment (line 50) | public static function withFragment(string $fragment): WithFragment
method withoutFragment (line 55) | public static function withoutFragment(): WithFragment
FILE: src/Steps/Sitemap.php
class Sitemap (line 8) | class Sitemap
method getSitemapsFromRobotsTxt (line 10) | public static function getSitemapsFromRobotsTxt(): GetSitemapsFromRobo...
method getUrlsFromSitemap (line 15) | public static function getUrlsFromSitemap(): GetUrlsFromSitemap
FILE: src/Steps/Sitemap/GetUrlsFromSitemap.php
class GetUrlsFromSitemap (line 13) | class GetUrlsFromSitemap extends Step
method fixUrlSetTag (line 23) | public static function fixUrlSetTag(XmlDocument $dom): XmlDocument
method withData (line 32) | public function withData(): static
method outputType (line 39) | public function outputType(): StepOutputType
method invoke (line 47) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 67) | protected function validateAndSanitizeInput(mixed $input): mixed
method getWithAdditionalData (line 75) | protected function getWithAdditionalData(XmlElement $urlNode): array
FILE: src/Steps/Step.php
class Step (line 21) | abstract class Step extends BaseStep
method invoke (line 32) | abstract protected function invoke(mixed $input): Generator;
method invokeStep (line 40) | final public function invokeStep(Input $input): Generator
method updateInputUsingOutput (line 75) | public function updateInputUsingOutput(Closure $closure): static
method excludeFromGroupOutput (line 82) | public function excludeFromGroupOutput(): static
method oneOutputPerInput (line 89) | public function oneOutputPerInput(): static
method shouldOutputBeExcludedFromGroupOutput (line 96) | public function shouldOutputBeExcludedFromGroupOutput(): bool
method callUpdateInputUsingOutput (line 104) | public function callUpdateInputUsingOutput(Input $input, Output $outpu...
method validateAndSanitizeInput (line 125) | protected function validateAndSanitizeInput(mixed $input): mixed
method validateAndSanitizeStringOrStringable (line 133) | protected function validateAndSanitizeStringOrStringable(
method validateAndSanitizeStringOrHttpResponse (line 153) | protected function validateAndSanitizeStringOrHttpResponse(
method validateAndSanitizeToUriInterface (line 177) | protected function validateAndSanitizeToUriInterface(
method validateAndSanitizeToHtmlDocumentInstance (line 205) | protected function validateAndSanitizeToHtmlDocumentInstance(
method validateAndSanitizeToXmlDocumentInstance (line 215) | protected function validateAndSanitizeToXmlDocumentInstance(
method getSingleElementFromArray (line 222) | protected function getSingleElementFromArray(mixed $inputValue): mixed
method invokeAndYield (line 234) | private function invokeAndYield(mixed $validInputValue, Input $input):...
method invokeAndYieldOneOutputPerInput (line 264) | private function invokeAndYieldOneOutputPerInput(mixed $validInputValu...
method removeUtf8BomFromString (line 302) | private function removeUtf8BomFromString(string $string): string
method logInvalidInputException (line 311) | private function logInvalidInputException(InvalidArgumentException $ex...
FILE: src/Steps/StepInterface.php
type StepInterface (line 11) | interface StepInterface
method addLogger (line 13) | public function addLogger(LoggerInterface $logger): static;
method invokeStep (line 19) | public function invokeStep(Input $input): Generator;
method keep (line 24) | public function keep(string|array|null $keys = null): static;
method keepAs (line 26) | public function keepAs(string $key): static;
method keepFromInput (line 31) | public function keepFromInput(string|array|null $keys = null): static;
method keepInputAs (line 33) | public function keepInputAs(string $key): static;
method keepsAnything (line 35) | public function keepsAnything(): bool;
method keepsAnythingFromInputData (line 37) | public function keepsAnythingFromInputData(): bool;
method keepsAnythingFromOutputData (line 39) | public function keepsAnythingFromOutputData(): bool;
method useInputKey (line 41) | public function useInputKey(string $key): static;
method uniqueInputs (line 43) | public function uniqueInputs(?string $key = null): static;
method uniqueOutputs (line 45) | public function uniqueOutputs(?string $key = null): static;
method where (line 47) | public function where(string|FilterInterface $keyOrFilter, ?FilterInte...
method orWhere (line 49) | public function orWhere(string|FilterInterface $keyOrFilter, ?FilterIn...
method outputKey (line 51) | public function outputKey(string $key): static;
method maxOutputs (line 53) | public function maxOutputs(int $maxOutputs): static;
method resetAfterRun (line 55) | public function resetAfterRun(): void;
FILE: src/Steps/Xml.php
class Xml (line 12) | class Xml extends Dom
method makeDefaultDomQueryInstance (line 17) | public function makeDefaultDomQueryInstance(string $query): DomQuery
method validateAndSanitizeInput (line 27) | protected function validateAndSanitizeInput(mixed $input): XmlDocument
FILE: src/Stores/JsonFileStore.php
class JsonFileStore (line 8) | class JsonFileStore extends Store
method __construct (line 12) | public function __construct(protected readonly string $storePath, prot...
method store (line 24) | public function store(Result $result): void
method filePath (line 39) | public function filePath(): string
FILE: src/Stores/SimpleCsvFileStore.php
class SimpleCsvFileStore (line 8) | class SimpleCsvFileStore extends Store
method __construct (line 14) | public function __construct(protected readonly string $storePath, prot...
method store (line 24) | public function store(Result $result): void
method filePath (line 49) | public function filePath(): string
method anyPropertyIsArray (line 55) | protected function anyPropertyIsArray(Result $result): bool
method flattenResultArray (line 70) | protected function flattenResultArray(array $result): array
FILE: src/Stores/Store.php
class Store (line 7) | abstract class Store implements StoreInterface
method addLogger (line 11) | public function addLogger(LoggerInterface $logger): static
FILE: src/Stores/StoreInterface.php
type StoreInterface (line 8) | interface StoreInterface
method store (line 10) | public function store(Result $result): void;
method addLogger (line 12) | public function addLogger(LoggerInterface $logger): static;
FILE: src/UserAgents/BotUserAgent.php
class BotUserAgent (line 5) | class BotUserAgent implements BotUserAgentInterface
method __construct (line 12) | public function __construct(
method make (line 18) | public static function make(string $productToken, ?string $crawlerInfo...
method __toString (line 23) | public function __toString(): string
method productToken (line 38) | public function productToken(): string
FILE: src/UserAgents/BotUserAgentInterface.php
type BotUserAgentInterface (line 5) | interface BotUserAgentInterface extends UserAgentInterface
method productToken (line 7) | public function productToken(): string;
FILE: src/UserAgents/UserAgent.php
class UserAgent (line 5) | class UserAgent implements UserAgentInterface
method __construct (line 7) | public function __construct(protected readonly string $userAgent) {}
method __toString (line 9) | public function __toString(): string
method mozilla5CompatibleBrowser (line 14) | public static function mozilla5CompatibleBrowser(): self
FILE: src/UserAgents/UserAgentInterface.php
type UserAgentInterface (line 5) | interface UserAgentInterface
method __toString (line 7) | public function __toString(): string;
FILE: src/Utils/Gzip.php
class Gzip (line 7) | class Gzip
method encode (line 12) | public static function encode(string $string, bool $throwException = f...
method decode (line 26) | public static function decode(string $string, bool $throwException = f...
FILE: src/Utils/HttpHeaders.php
class HttpHeaders (line 5) | final class HttpHeaders
method normalize (line 11) | public static function normalize(array $headers): array
method merge (line 27) | public static function merge(array $headers, array $mergeHeaders): array
method addTo (line 46) | public static function addTo(array $headers, string $headerName, strin...
FILE: src/Utils/OutputTypeHelper.php
class OutputTypeHelper (line 5) | class OutputTypeHelper
method objectToArray (line 10) | public static function objectToArray(object $output): array
method isScalar (line 23) | public static function isScalar(mixed $output): bool
method isAssociativeArrayOrObject (line 28) | public static function isAssociativeArrayOrObject(mixed $output): bool
method isAssociativeArray (line 33) | public static function isAssociativeArray(mixed $output): bool
method recursiveChildObjectsToArray (line 50) | public static function recursiveChildObjectsToArray(array $data): array
FILE: src/Utils/RequestKey.php
class RequestKey (line 10) | class RequestKey
method from (line 26) | public static function from(RequestInterface|RespondedRequest $request...
method removeIgnoreHeaders (line 49) | private static function removeIgnoreHeaders(array $data, array $ignore...
FILE: src/Utils/TemplateString.php
class TemplateString (line 7) | class TemplateString
method resolve (line 12) | public static function resolve(string $string, array $data = []): string
method trimAndUnescapeQuotes (line 33) | private static function trimAndUnescapeQuotes(string $string): string
FILE: tests/Cache/FileCacheTest.php
function helper_addMultipleItemsToCache (line 25) | function helper_addMultipleItemsToCache(array $items, FileCache $cache):...
function helper_respondedRequestWithRequestUrl (line 32) | function helper_respondedRequestWithRequestUrl(string $requestUrl): Resp...
function helper_getCacheItemByKey (line 41) | function helper_getCacheItemByKey(string $key): ?CacheItem
FILE: tests/CrawlerTest.php
function helper_getDummyCrawler (line 21) | function helper_getDummyCrawler(): Crawler
function helper_getDummyCrawlerWithInputReturningStep (line 26) | function helper_getDummyCrawlerWithInputReturningStep(): Crawler
method invoke (line 287) | protected function invoke(mixed $input): Generator
method outputType (line 294) | public function outputType(): StepOutputType
method invoke (line 301) | protected function invoke(mixed $input): Generator
method outputType (line 306) | public function outputType(): StepOutputType
method invoke (line 342) | protected function invoke(mixed $input): Generator
method invoke (line 407) | protected function invoke(mixed $input): Generator
method invoke (line 418) | protected function invoke(mixed $input): Generator
method store (line 427) | public function store(Result $result): void
method invoke (line 462) | protected function invoke(mixed $input): Generator
method invoke (line 473) | protected function invoke(mixed $input): Generator
method invoke (line 484) | protected function invoke(mixed $input): Generator
method invoke (line 495) | protected function invoke(mixed $input): Generator
method store (line 506) | public function store(Result $result): void
method invoke (line 566) | protected function invoke(mixed $input): Generator
method invoke (line 577) | protected function invoke(mixed $input): Generator
method invoke (line 590) | protected function invoke(mixed $input): Generator
method invoke (line 601) | protected function invoke(mixed $input): Generator
method store (line 614) | public function store(Result $result): void
FILE: tests/IoTest.php
function helper_getIoInstance (line 10) | function helper_getIoInstance(
FILE: tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php
function helper_setUpHeadlessChromeMocks (line 25) | function helper_setUpHeadlessChromeMocks(
FILE: tests/Loader/Http/HttpLoaderPolitenessTest.php
function helper_wait300ms (line 24) | function helper_wait300ms(): void
FILE: tests/Loader/Http/HttpLoaderTest.php
method isAllowedToBeLoaded (line 263) | public function isAllowedToBeLoaded(UriInterface $uri, bool $throwsExcep...
method isAllowedToBeLoaded (line 286) | public function isAllowedToBeLoaded(UriInterface $uri, bool $throwsExcep...
method trackRequestStartFor (line 332) | public function trackRequestStartFor(UriInterface $url): void
method trackRequestEndFor (line 339) | public function trackRequestEndFor(UriInterface $url): void
method trackRequestEndFor (line 379) | public function trackRequestEndFor(UriInterface $url): void
FILE: tests/Loader/Http/Politeness/RobotsTxtHandlerTest.php
function helper_getLoaderWithRobotsTxt (line 18) | function helper_getLoaderWithRobotsTxt(string $robotsTxtContent = '', ?U...
FILE: tests/Loader/LoaderTest.php
method __construct (line 13) | public function __construct(BotUserAgent $userAgent, private readonly st...
method load (line 18) | public function load(mixed $subject): mixed
method loadOrFail (line 29) | public function loadOrFail(mixed $subject): mixed
method load (line 64) | public function load(mixed $subject): mixed
method loadOrFail (line 71) | public function loadOrFail(mixed $subject): mixed
method load (line 95) | public function load(mixed $subject): mixed
method loadOrFail (line 104) | public function loadOrFail(mixed $subject): mixed
method load (line 124) | public function load(mixed $subject): string
method loadOrFail (line 130) | public function loadOrFail(mixed $subject): mixed
FILE: tests/Pest.php
class TestServerProcess (line 30) | class TestServerProcess
function helper_dump (line 55) | function helper_dump(mixed $var): void
function helper_dieDump (line 60) | function helper_dieDump(mixed $var): void
function helper_getValueReturningStep (line 67) | function helper_getValueReturningStep(mixed $value): Step
function helper_getInputReturningStep (line 86) | function helper_getInputReturningStep(): Step
function helper_getNumberIncrementingStep (line 96) | function helper_getNumberIncrementingStep(): Step
function helper_getStepYieldingMultipleNumbers (line 106) | function helper_getStepYieldingMultipleNumbers(): Step
function helper_getStepYieldingMultipleArraysWithNumber (line 118) | function helper_getStepYieldingMultipleArraysWithNumber(): Step
function helper_getStepYieldingObjectWithNumber (line 130) | function helper_getStepYieldingObjectWithNumber(int $number): Step
function helper_getStepYieldingMultipleObjectsWithNumber (line 144) | function helper_getStepYieldingMultipleObjectsWithNumber(): Step
function helper_getStepYieldingInputArrayAsSeparateOutputs (line 158) | function helper_getStepYieldingInputArrayAsSeparateOutputs(): Step
function helper_getLoadingStep (line 170) | function helper_getLoadingStep(): Step
function helper_getDummyRobotsTxtResponse (line 185) | function helper_getDummyRobotsTxtResponse(?string $forDomain = null): Re...
function helper_traverseIterable (line 199) | function helper_traverseIterable(iterable $iterable): void
function helper_arrayToGenerator (line 210) | function helper_arrayToGenerator(array $array): Generator
function helper_generatorToArray (line 221) | function helper_generatorToArray(Generator $generator): array
function helper_invokeStepWithInput (line 235) | function helper_invokeStepWithInput(StepInterface $step, mixed $input = ...
function helper_getStepFilesContent (line 240) | function helper_getStepFilesContent(string $filePathInFilesFolder): string
function helper_getStdClassWithData (line 254) | function helper_getStdClassWithData(array $data): stdClass
function helper_getSimpleListHtml (line 265) | function helper_getSimpleListHtml(): string
function helper_getFastLoader (line 277) | function helper_getFastLoader(
function helper_getFastCrawler (line 291) | function helper_getFastCrawler(): HttpCrawler
function helper_nonBotUserAgent (line 306) | function helper_nonBotUserAgent(): UserAgent
function helper_getMinThrottler (line 311) | function helper_getMinThrottler(): Throttler
function helper_getRespondedRequest (line 320) | function helper_getRespondedRequest(
function helper_cachedir (line 344) | function helper_cachedir(?string $inDir = null): string
function helper_resetCacheDir (line 355) | function helper_resetCacheDir(): void
function helper_storagedir (line 360) | function helper_storagedir(?string $inDir = null): string
function helper_resetStorageDir (line 371) | function helper_resetStorageDir(): void
function helper_resetTempDir (line 376) | function helper_resetTempDir(string $dirPath): void
function helper_testfilesdir (line 391) | function helper_testfilesdir(?string $inDir = null): string
FILE: tests/Steps/BaseStepTest.php
class TestStep (line 34) | class TestStep extends BaseStep
method invokeStep (line 38) | public function invokeStep(Input $input): Generator
method invoke (line 165) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 170) | protected function validateAndSanitizeInput(mixed $input): mixed
method __construct (line 202) | public function __construct()
method invoke (line 211) | protected function invoke(mixed $input): Generator
method __construct (line 234) | public function __construct()
method invoke (line 241) | protected function invoke(mixed $input): Generator
method invoke (line 268) | protected function invoke(mixed $input): Generator
method outputType (line 273) | public function outputType(): StepOutputType
class SomeDemoStep (line 286) | class SomeDemoStep extends Step
method invoke (line 288) | protected function invoke(mixed $input): Generator
class ParentStepClass (line 308) | class ParentStepClass extends Step
method invoke (line 310) | protected function invoke(mixed $input): Generator
method invoke (line 336) | protected function invoke(mixed $input): Generator
method invoke (line 343) | protected function invoke(mixed $input): Generator
method invokeStep (line 348) | public function invokeStep(Input $input): Generator
method invoke (line 428) | protected function invoke(mixed $input): Generator
method outputType (line 433) | public function outputType(): StepOutputType
class ParentStepClassTwo (line 454) | class ParentStepClassTwo extends Step
method invoke (line 456) | protected function invoke(mixed $input): Generator
method outputType (line 461) | public function outputType(): StepOutputType
method invoke (line 491) | protected function invoke(mixed $input): Generator
method outputType (line 496) | public function outputType(): StepOutputType
method invoke (line 503) | protected function invoke(mixed $input): Generator
method outputType (line 508) | public function outputType(): StepOutputType
method invokeStep (line 513) | public function invokeStep(Input $input): Generator
method toArray (line 554) | public function toArray(): array
method toArray (line 649) | public function toArray(): array
FILE: tests/Steps/CsvTest.php
function helper_csvFilePath (line 19) | function helper_csvFilePath(string $fileName): string
method __toString (line 69) | public function __toString(): string
FILE: tests/Steps/Dom/NodeTest.php
function helper_getSymfonyCrawlerInstanceFromSource (line 20) | function helper_getSymfonyCrawlerInstanceFromSource(string $source, stri...
function helper_getLegacyDomNodeInstanceFromSource (line 28) | function helper_getLegacyDomNodeInstanceFromSource(string $source, strin...
function helper_getPhp84HtmlDomNodeInstanceFromSource (line 39) | function helper_getPhp84HtmlDomNodeInstanceFromSource(string $source, st...
function helper_getPhp84XmlDomNodeInstanceFromSource (line 48) | function helper_getPhp84XmlDomNodeInstanceFromSource(string $source, str...
function helper_getAbstractNodeInstance (line 60) | function helper_getAbstractNodeInstance(object $originalNode, bool $html...
method makeChildNodeInstance (line 84) | protected function makeChildNodeInstance(object $node): Node
method makeChildNodeInstance (line 107) | protected function makeChildNodeInstance(object $node): Node
method makeChildNodeInstance (line 130) | protected function makeChildNodeInstance(object $node): Node
FILE: tests/Steps/Dom/_Stubs/HtmlNodeStub.php
class HtmlNodeStub (line 8) | class HtmlNodeStub extends Node
method inner (line 10) | public function inner(): string
method outer (line 15) | public function outer(): string
method makeChildNodeInstance (line 20) | protected function makeChildNodeInstance(object $node): Node
FILE: tests/Steps/Dom/_Stubs/XmlNodeStub.php
class XmlNodeStub (line 8) | class XmlNodeStub extends Node
method inner (line 10) | public function inner(): string
method outer (line 15) | public function outer(): string
method makeChildNodeInstance (line 20) | protected function makeChildNodeInstance(object $node): Node
FILE: tests/Steps/DomTest.php
function helper_getDomStepInstance (line 23) | function helper_getDomStepInstance(array $mapping = []): Dom
FILE: tests/Steps/Filters/FilterTest.php
class TestFilter (line 11) | class TestFilter extends AbstractFilter
method evaluate (line 15) | public function evaluate(mixed $valueInQuestion): bool
FILE: tests/Steps/GroupTest.php
function helper_addStepsToGroup (line 26) | function helper_addStepsToGroup(Group $group, Step ...$steps): Group
function helper_addUpdateInputUsingOutputCallbackToSteps (line 35) | function helper_addUpdateInputUsingOutputCallbackToSteps(Closure $callba...
function helper_getStepThatRemembersIfItWasCalled (line 42) | function helper_getStepThatRemembersIfItWasCalled(): Step
method invoke (line 577) | protected function invoke(mixed $input): Generator
method invoke (line 586) | protected function invoke(mixed $input): Generator
method invoke (line 837) | protected function invoke(mixed $input): Generator
method invoke (line 848) | protected function invoke(mixed $input): Generator
method invoke (line 874) | protected function invoke(mixed $input): Generator
method invoke (line 885) | protected function invoke(mixed $input): Generator
FILE: tests/Steps/Html/SchemaOrgTest.php
function helper_schemaOrgExampleOneJobPostingInBody (line 11) | function helper_schemaOrgExampleOneJobPostingInBody(): string
function helper_schemaOrgExampleMultipleObjects (line 25) | function helper_schemaOrgExampleMultipleObjects(): string
FILE: tests/Steps/HtmlTest.php
function helper_getHtmlContent (line 15) | function helper_getHtmlContent(string $fileName): string
FILE: tests/Steps/Loading/Http/Paginators/QueryParams/AbstractQueryParamManipulatorTest.php
method execute (line 12) | public function execute(Query $query): Query
method execute (line 29) | public function execute(Query $query): Query
FILE: tests/Steps/Loading/Http/Paginators/SimpleWebsitePaginatorTest.php
function helper_getRespondedRequestWithResponseBody (line 15) | function helper_getRespondedRequestWithResponseBody(string $urlPath, str...
function helper_createResponseBodyWithPaginationLinks (line 23) | function helper_createResponseBodyWithPaginationLinks(array $links): string
method parentRequests (line 292) | public function parentRequests(): array
FILE: tests/Steps/Loading/LoadingStepTest.php
method invoke (line 23) | protected function invoke(mixed $input): Generator
method invoke (line 58) | protected function invoke(mixed $input): Generator
FILE: tests/Steps/Refiners/AbstractRefinerTest.php
class SomeRefiner (line 9) | class SomeRefiner extends AbstractRefiner
method refine (line 11) | public function refine(mixed $value): mixed
method testLogTypeWarning (line 18) | public function testLogTypeWarning(): void
FILE: tests/Steps/StepTest.php
method invoke (line 40) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 459) | protected function validateAndSanitizeInput(mixed $input): string
method invoke (line 464) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 480) | protected function validateAndSanitizeInput(mixed $input): string
method invoke (line 485) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 504) | protected function validateAndSanitizeInput(mixed $input): string
method invoke (line 509) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 533) | protected function validateAndSanitizeInput(mixed $input): string
method invoke (line 538) | protected function invoke(mixed $input): Generator
method validateAndSanitizeInput (line 563) | protected function validateAndSanitizeInput(mixed $input): string
method invoke (line 568) | protected function invoke(mixed $input): Generator
method invoke (line 582) | protected function invoke(mixed $input): Generator
method invoke (line 651) | protected function invoke(mixed $input): Generator
method invoke (line 681) | protected function invoke(mixed $input): Generator
method invoke (line 726) | protected function invoke(mixed $input): Generator
method outputType (line 731) | public function outputType(): StepOutputType
method invoke (line 855) | protected function invoke(mixed $input): Generator
method outputKeyAliases (line 864) | protected function outputKeyAliases(): array
method invoke (line 887) | protected function invoke(mixed $input): Generator
method outputKeyAliases (line 895) | protected function outputKeyAliases(): array
method invoke (line 912) | protected function invoke(mixed $input): Generator
method outputKeyAliases (line 933) | protected function outputKeyAliases(): array
FILE: tests/Stores/JsonFileStoreTest.php
function helper_getResultWithJsonData (line 11) | function helper_getResultWithJsonData(array $data): Result
FILE: tests/Stores/SimpleCsvFileStoreTest.php
function helper_getResultWithData (line 11) | function helper_getResultWithData(array $data): Result
FILE: tests/Utils/OutputTypeHelperTest.php
method toArrayForResult (line 13) | public function toArrayForResult(): array
method toArray (line 27) | public function toArray(): array
method __serialize (line 38) | public function __serialize(): array
FILE: tests/_Integration/GroupTest.php
method userAgent (line 21) | protected function userAgent(): UserAgentInterface
method loader (line 26) | public function loader(UserAgentInterface $userAgent, LoggerInterface $l...
FILE: tests/_Integration/Http/CharsetTest.php
class CharsetExampleCrawler (line 16) | class CharsetExampleCrawler extends HttpCrawler
method loader (line 18) | public function loader(UserAgentInterface $userAgent, LoggerInterface ...
method userAgent (line 23) | protected function userAgent(): UserAgentInterface
FILE: tests/_Integration/Http/CrawlingTest.php
class TestLoader (line 33) | class TestLoader extends HttpLoader
method __construct (line 40) | public function __construct(
method load (line 75) | public function load(mixed $subject): ?RespondedRequest
class Crawler (line 92) | class Crawler extends HttpCrawler
method loader (line 94) | public function loader(UserAgentInterface $userAgent, LoggerInterface ...
method userAgent (line 122) | protected function userAgent(): UserAgentInterface
method getLoader (line 130) | public function getLoader(): TestLoader
FILE: tests/_Integration/Http/ErrorResponsesTest.php
class ErrorCrawler (line 20) | class ErrorCrawler extends HttpCrawler
method logger (line 22) | protected function logger(): LoggerInterface
method userAgent (line 27) | protected function userAgent(): UserAgentInterface
method loader (line 32) | public function loader(UserAgentInterface $userAgent, LoggerInterface ...
FILE: tests/_Integration/Http/GzipTest.php
class GzipCrawler (line 17) | class GzipCrawler extends HttpCrawler
method userAgent (line 19) | protected function userAgent(): UserAgentInterface
method loader (line 24) | public function loader(UserAgentInterface $userAgent, LoggerInterface ...
FILE: tests/_Integration/Http/HeadlessBrowserTest.php
class HeadlessBrowserCrawler (line 28) | class HeadlessBrowserCrawler extends HttpCrawler
method userAgent (line 30) | protected function userAgent(): UserAgentInterface
method loader (line 35) | public function loader(UserAgentInterface $userAgent, LoggerInterface ...
class GetJsonFromResponseHtmlBody (line 45) | class GetJsonFromResponseHtmlBody extends Step
method invoke (line 47) | protected function invoke(mixed $input): Generator
class GetStringFromResponseHtmlBody (line 57) | class GetStringFromResponseHtmlBody extends Step
method invoke (line 59) | protected function invoke(mixed $input): Generator
function helper_getCookiesByDomainFromLoader (line 70) | function helper_getCookiesByDomainFromLoader(HttpLoader $loader, string ...
method invoke (line 125) | protected function invoke(mixed $input): Generator
method invoke (line 149) | protected function invoke(mixed $input): Generator
method invoke (line 230) | protected function invoke(mixed $input): Generator
method invoke (line 263) | protected function invoke(mixed $input): Generator
FILE: tests/_Integration/Http/Html/PaginatedListingTest.php
method userAgent (line 18) | protected function userAgent(): UserAgentInterface
method loader (line 23) | public function loader(UserAgentInterface $userAgent, LoggerInterface $l...
FILE: tests/_Integration/Http/Html/SimpleListingTest.php
method userAgent (line 18) | protected function userAgent(): UserAgentInterface
method loader (line 23) | public function loader(UserAgentInterface $userAgent, LoggerInterface $l...
FILE: tests/_Integration/Http/PaginationTest.php
class PaginationCrawler (line 16) | class PaginationCrawler extends HttpCrawler
method userAgent (line 18) | protected function userAgent(): UserAgentInterface
method loader (line 23) | protected function loader(UserAgentInterface $userAgent, LoggerInterfa...
FILE: tests/_Integration/Http/ProxyingTest.php
class ProxyServerProcesses (line 9) | class ProxyServerProcesses
FILE: tests/_Integration/Http/PublisherExampleTest.php
class PublisherExampleCrawler (line 17) | class PublisherExampleCrawler extends HttpCrawler
method loader (line 19) | public function loader(UserAgentInterface $userAgent, LoggerInterface ...
method userAgent (line 24) | protected function userAgent(): UserAgentInterface
method build (line 80) | public function build(): \Crwlr\Crawler\Crawler
method extractAuthorData (line 92) | private function extractAuthorData(): Html
method extractBookData (line 110) | private function extractBookData(): Html
method extractEditionData (line 121) | private function extractEditionData(): Html
method build (line 185) | public function build(): \Crwlr\Crawler\Crawler
method extractAuthorData (line 197) | private function extractAuthorData(): Html
method extractBookData (line 213) | private function extractBookData(): Html
method extractEditionData (line 224) | private function extractEditionData(): Html
FILE: tests/_Integration/Http/QueryParamPaginationTest.php
class QueryParamPaginationCrawler (line 19) | class QueryParamPaginationCrawler extends HttpCrawler
method userAgent (line 21) | protected function userAgent(): UserAgentInterface
method loader (line 26) | protected function loader(UserAgentInterface $userAgent, LoggerInterfa...
FILE: tests/_Integration/Http/RedirectTest.php
class RedirectTestCrawler (line 20) | class RedirectTestCrawler extends HttpCrawler
method userAgent (line 22) | protected function userAgent(): UserAgentInterface
class GetResponseBodyAsString (line 28) | class GetResponseBodyAsString extends Step
method invoke (line 34) | protected function invoke(mixed $input): Generator
method userAgent (line 75) | protected function userAgent(): UserAgentInterface
method loader (line 80) | protected function loader(UserAgentInterface $userAgent, LoggerInterface...
FILE: tests/_Integration/Http/RequestParamsFromInputTest.php
method invoke (line 15) | protected function invoke(mixed $input): Generator
FILE: tests/_Integration/Http/RetryErrorResponsesTest.php
class RetryErrorResponsesCrawler (line 16) | class RetryErrorResponsesCrawler extends HttpCrawler
method userAgent (line 18) | protected function userAgent(): UserAgentInterface
method loader (line 23) | public function loader(UserAgentInterface $userAgent, LoggerInterface ...
FILE: tests/_Integration/Http/RobotsTxtTest.php
class RobotsTxtCrawler (line 20) | class RobotsTxtCrawler extends HttpCrawler
method logger (line 22) | protected function logger(): LoggerInterface
method userAgent (line 27) | protected function userAgent(): UserAgentInterface
method loader (line 32) | public function loader(UserAgentInterface $userAgent, LoggerInterface ...
FILE: tests/_Integration/Http/TimeoutTest.php
method userAgent (line 18) | protected function userAgent(): UserAgentInterface
method loader (line 23) | public function loader(UserAgentInterface $userAgent, LoggerInterface $l...
FILE: tests/_Integration/Server.php
function getParamAfter (line 5) | function getParamAfter(string $route, string $after): string
FILE: tests/_Stubs/AbstractTestPaginator.php
class AbstractTestPaginator (line 10) | class AbstractTestPaginator extends AbstractPaginator
method __construct (line 12) | public function __construct(
method getNextRequest (line 19) | public function getNextRequest(): ?RequestInterface
method getLoaded (line 27) | public function getLoaded(): array
method getLoadedCount (line 32) | public function getLoadedCount(): int
method getLatestRequest (line 37) | public function getLatestRequest(): ?RequestInterface
method limitReached (line 42) | public function limitReached(): bool
method setFinished (line 47) | public function setFinished(): AbstractPaginator
FILE: tests/_Stubs/Crawlers/DummyOne.php
class DummyOne (line 12) | class DummyOne extends Crawler
method userAgent (line 17) | public function userAgent(): UserAgentInterface
method loader (line 22) | public function loader(UserAgentInterface $userAgent, LoggerInterface ...
FILE: tests/_Stubs/Crawlers/DummyTwo.php
class DummyTwo (line 22) | class DummyTwo extends Crawler
method userAgent (line 33) | protected function userAgent(): UserAgentInterface
method logger (line 43) | protected function logger(): LoggerInterface
method loader (line 53) | protected function loader(UserAgentInterface $userAgent, LoggerInterfa...
FILE: tests/_Stubs/Crawlers/DummyTwo/DummyTwoLoader.php
class DummyTwoLoader (line 7) | class DummyTwoLoader extends HttpLoader
FILE: tests/_Stubs/Crawlers/DummyTwo/DummyTwoLogger.php
class DummyTwoLogger (line 7) | class DummyTwoLogger extends CliLogger
FILE: tests/_Stubs/Crawlers/DummyTwo/DummyTwoUserAgent.php
class DummyTwoUserAgent (line 7) | class DummyTwoUserAgent extends BotUserAgent
FILE: tests/_Stubs/DummyLogger.php
class DummyLogger (line 10) | class DummyLogger implements LoggerInterface
method emergency (line 17) | public function emergency(string|Stringable $message, array $context =...
method alert (line 22) | public function alert(string|Stringable $message, array $context = [])...
method critical (line 27) | public function critical(string|Stringable $message, array $context = ...
method error (line 32) | public function error(string|Stringable $message, array $context = [])...
method warning (line 37) | public function warning(string|Stringable $message, array $context = [...
method notice (line 42) | public function notice(string|Stringable $message, array $context = []...
method info (line 47) | public function info(string|Stringable $message, array $context = []):...
method debug (line 52) | public function debug(string|Stringable $message, array $context = [])...
method log (line 61) | public function log($level, string|Stringable $message, array $context...
FILE: tests/_Stubs/PhantasyLoader.php
class PhantasyLoader (line 7) | class PhantasyLoader extends Loader
method load (line 9) | public function load(mixed $subject): mixed
method loadOrFail (line 14) | public function loadOrFail(mixed $subject): mixed
FILE: tests/_Stubs/RespondedRequestChild.php
class RespondedRequestChild (line 8) | class RespondedRequestChild extends RespondedRequest
method fromRespondedRequest (line 13) | public static function fromRespondedRequest(RespondedRequest $responde...
method fromArray (line 18) | public static function fromArray(array $data): RespondedRequestChild
method itseme (line 25) | public function itseme(): string
Condensed preview — 326 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (1,231K chars).
[
{
"path": ".editorconfig",
"chars": 315,
"preview": "# EditorConfig is awesome: http://EditorConfig.org\n\nroot = true\n\n[*]\ncharset = utf-8\nend_of_line = lf\nindent_style = spa"
},
{
"path": ".gitattributes",
"chars": 265,
"preview": ".github export-ignore\nbin/add-git-hooks export-ignore\ngit-hooks export-ignore\ntests export-ignore\n.editorconfig export-i"
},
{
"path": ".github/workflows/ci.yml",
"chars": 1658,
"preview": "name: CI\n\non: pull_request\n\njobs:\n tests:\n name: PestPHP Tests\n runs-on: ubuntu-latest\n strategy:\n matrix"
},
{
"path": ".gitignore",
"chars": 170,
"preview": "composer.lock\nvendor\n.php_cs.cache\n.php-cs-fixer.cache\n.phpunit.result.cache\n.phpunit.cache\n/cachedir\n/storedir\n/tests/_"
},
{
"path": ".php-cs-fixer.php",
"chars": 640,
"preview": "<?php\n\nuse PhpCsFixer\\Config;\nuse PhpCsFixer\\Finder;\nuse PhpCsFixer\\Runner\\Parallel\\ParallelConfigFactory;\n\n$finder = Fi"
},
{
"path": "CHANGELOG.md",
"chars": 60646,
"preview": "# Changelog\nAll notable changes to this project will be documented in this file.\n\nThe format is based on [Keep a Changel"
},
{
"path": "CONTRIBUTING.md",
"chars": 2022,
"preview": "# Contributing to this Package\n\nThat you're reading this must mean you consider contributing to\nthis package. So first o"
},
{
"path": "LICENSE",
"chars": 1059,
"preview": "Copyright (c) 2026 Christian Olear\n\nPermission is hereby granted, free of charge, to any person obtaining\na copy of this"
},
{
"path": "README.md",
"chars": 3178,
"preview": "<p align=\"center\"><a href=\"https://www.crwlr.software\" target=\"_blank\"><img src=\"https://github.com/crwlrsoft/graphics/b"
},
{
"path": "bin/add-git-hooks",
"chars": 158,
"preview": "#!/usr/bin/env php\n<?php\n\n$src = __DIR__ . '/../git-hooks/pre-commit';\n$dest = __DIR__ . '/../.git/hooks/pre-commit';\n\nc"
},
{
"path": "composer.json",
"chars": 2804,
"preview": "{\n \"name\": \"crwlr/crawler\",\n \"description\": \"Web crawling and scraping library.\",\n \"type\": \"library\",\n \"keyw"
},
{
"path": "git-hooks/pre-commit",
"chars": 1612,
"preview": "#!/usr/bin/env php\n<?php\n\nrun('composer test', 'Unit tests');\nrun('composer test-integration', 'Integration tests');\nrun"
},
{
"path": "phpstan.neon",
"chars": 1304,
"preview": "parameters:\n level: 8\n paths:\n - src\n - tests\n excludePaths:\n analyse:\n - tests"
},
{
"path": "phpunit.xml",
"chars": 568,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<phpunit xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:noNamespaceSch"
},
{
"path": "src/Cache/CacheItem.php",
"chars": 1901,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Cache;\n\nuse DateInterval;\nuse DateTimeImmutable;\nuse Exception;\n\nclass CacheItem\n{\n pr"
},
{
"path": "src/Cache/Exceptions/MissingZlibExtensionException.php",
"chars": 185,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Cache\\Exceptions;\n\nuse Exception;\nuse Psr\\SimpleCache\\CacheException;\n\nclass MissingZlibE"
},
{
"path": "src/Cache/Exceptions/ReadingCacheFailedException.php",
"chars": 183,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Cache\\Exceptions;\n\nuse Exception;\nuse Psr\\SimpleCache\\CacheException;\n\nclass ReadingCache"
},
{
"path": "src/Cache/FileCache.php",
"chars": 7199,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Cache;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\nuse Crwlr\\Crawl"
},
{
"path": "src/Crawler.php",
"chars": 9416,
"preview": "<?php\n\nnamespace Crwlr\\Crawler;\n\nuse Closure;\nuse Crwlr\\Crawler\\Loader\\LoaderInterface;\nuse Crwlr\\Crawler\\Logger\\CliLogg"
},
{
"path": "src/HttpCrawler/AnonymousHttpCrawlerBuilder.php",
"chars": 1296,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\HttpCrawler;\n\nuse Crwlr\\Crawler\\HttpCrawler;\nuse Crwlr\\Crawler\\UserAgents\\BotUserAgent;\nu"
},
{
"path": "src/HttpCrawler.php",
"chars": 705,
"preview": "<?php\n\nnamespace Crwlr\\Crawler;\n\nuse Crwlr\\Crawler\\HttpCrawler\\AnonymousHttpCrawlerBuilder;\nuse Crwlr\\Crawler\\Loader\\Htt"
},
{
"path": "src/Input.php",
"chars": 59,
"preview": "<?php\n\nnamespace Crwlr\\Crawler;\n\nclass Input extends Io {}\n"
},
{
"path": "src/Io.php",
"chars": 3358,
"preview": "<?php\n\nnamespace Crwlr\\Crawler;\n\nuse Crwlr\\Crawler\\Utils\\OutputTypeHelper;\n\nclass Io\n{\n protected string|int|float|bo"
},
{
"path": "src/Loader/Http/Browser/Screenshot.php",
"chars": 154,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Browser;\n\nclass Screenshot\n{\n public function __construct(\n public "
},
{
"path": "src/Loader/Http/Browser/ScreenshotConfig.php",
"chars": 2270,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Browser;\n\nuse Crwlr\\Utils\\Microseconds;\nuse HeadlessChromium\\Clip;\nuse Headle"
},
{
"path": "src/Loader/Http/Cache/RetryManager.php",
"chars": 1096,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Cache;\n\n/**\n * @internal\n */\nclass RetryManager\n{\n /**\n * @param int[]"
},
{
"path": "src/Loader/Http/Cookies/Cookie.php",
"chars": 8835,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Cookies;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Cookies\\Exceptions\\InvalidCookieExcep"
},
{
"path": "src/Loader/Http/Cookies/CookieJar.php",
"chars": 5161,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Cookies;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Cookies\\Exceptions\\InvalidCookieExcep"
},
{
"path": "src/Loader/Http/Cookies/Date.php",
"chars": 952,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Cookies;\n\nuse DateTime;\nuse DateTimeInterface;\nuse InvalidArgumentException;\n"
},
{
"path": "src/Loader/Http/Cookies/Exceptions/InvalidCookieException.php",
"chars": 130,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Cookies\\Exceptions;\n\nuse Exception;\n\nclass InvalidCookieException extends Exc"
},
{
"path": "src/Loader/Http/Exceptions/LoadingException.php",
"chars": 1100,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Exceptions;\n\nuse Exception;\nuse Psr\\Http\\Message\\UriInterface;\nuse Throwable;"
},
{
"path": "src/Loader/Http/HeadlessBrowserLoaderHelper.php",
"chars": 14692,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http;\n\nuse Closure;\nuse Crwlr\\Crawler\\Loader\\Http\\Browser\\Screenshot;\nuse Crwlr\\Cr"
},
{
"path": "src/Loader/Http/HttpLoader.php",
"chars": 20613,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Cache\\RetryManager;\nuse Crwlr\\Crawler\\Loader\\"
},
{
"path": "src/Loader/Http/Messages/RespondedRequest.php",
"chars": 6201,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Messages;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\n"
},
{
"path": "src/Loader/Http/Politeness/RetryErrorResponseHandler.php",
"chars": 4666,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Politeness;\n\nuse Closure;\nuse Crwlr\\Crawler\\Loader\\Http\\Exceptions\\LoadingExc"
},
{
"path": "src/Loader/Http/Politeness/RobotsTxtHandler.php",
"chars": 3453,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Politeness;\n\nuse Crwlr\\Crawler\\Loader\\Http\\HttpLoader;\nuse Crwlr\\Crawler\\Load"
},
{
"path": "src/Loader/Http/Politeness/Throttler.php",
"chars": 6289,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Politeness;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Politeness\\TimingUnits\\MultipleOf;"
},
{
"path": "src/Loader/Http/Politeness/TimingUnits/MultipleOf.php",
"chars": 605,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http\\Politeness\\TimingUnits;\n\nuse Crwlr\\Utils\\Microseconds;\n\nclass MultipleOf\n{\n "
},
{
"path": "src/Loader/Http/ProxyManager.php",
"chars": 986,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader\\Http;\n\nclass ProxyManager\n{\n protected ?int $lastUsedProxy = null;\n\n /**\n "
},
{
"path": "src/Loader/Loader.php",
"chars": 3630,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader;\n\nuse Crwlr\\Crawler\\Logger\\CliLogger;\nuse Crwlr\\Crawler\\UserAgents\\UserAgentInterf"
},
{
"path": "src/Loader/LoaderInterface.php",
"chars": 911,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Loader;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Exceptions\\LoadingException;\nuse InvalidArgumentEx"
},
{
"path": "src/Logger/CliLogger.php",
"chars": 2683,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Logger;\n\nuse DateTime;\nuse InvalidArgumentException;\nuse Psr\\Log\\LoggerInterface;\nuse Str"
},
{
"path": "src/Logger/PreStepInvocationLogger.php",
"chars": 2234,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Logger;\n\nuse InvalidArgumentException;\nuse Psr\\Log\\LoggerInterface;\nuse Stringable;\nuse U"
},
{
"path": "src/Output.php",
"chars": 60,
"preview": "<?php\n\nnamespace Crwlr\\Crawler;\n\nclass Output extends Io {}\n"
},
{
"path": "src/Result.php",
"chars": 2019,
"preview": "<?php\n\nnamespace Crwlr\\Crawler;\n\nuse Crwlr\\Crawler\\Utils\\OutputTypeHelper;\n\nfinal class Result\n{\n /**\n * @var mix"
},
{
"path": "src/Steps/BaseStep.php",
"chars": 23484,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nuse Adbar\\Dot;\nuse Closure;\nuse Crwlr\\Crawler\\Crawler;\nuse Crwlr\\Crawler\\Input;\nu"
},
{
"path": "src/Steps/Csv.php",
"chars": 4414,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nuse Exception;\nuse Generator;\nuse InvalidArgumentException;\n\nclass Csv extends St"
},
{
"path": "src/Steps/Dom/DomDocument.php",
"chars": 455,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Dom;\n\nuse Dom\\Document;\nuse Symfony\\Component\\DomCrawler\\Crawler;\n\nabstract class D"
},
{
"path": "src/Steps/Dom/HtmlDocument.php",
"chars": 2430,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Dom;\n\nuse Crwlr\\Utils\\PhpVersion;\nuse DOMNode;\nuse Symfony\\Component\\DomCrawler\\Cra"
},
{
"path": "src/Steps/Dom/HtmlElement.php",
"chars": 784,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Dom;\n\nuse DOMNode;\nuse Symfony\\Component\\DomCrawler\\Crawler;\n\n/**\n * @method HtmlEl"
},
{
"path": "src/Steps/Dom/Node.php",
"chars": 5828,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Dom;\n\nuse Dom\\Document;\nuse Dom\\Element;\nuse Dom\\XPath;\nuse DOMNode;\nuse Symfony\\Co"
},
{
"path": "src/Steps/Dom/NodeList.php",
"chars": 3544,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Dom;\n\nuse ArrayIterator;\nuse Closure;\nuse Countable;\nuse Dom\\Element;\nuse DOMNode;\n"
},
{
"path": "src/Steps/Dom/XmlDocument.php",
"chars": 2505,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Dom;\n\nuse Crwlr\\Utils\\PhpVersion;\nuse DOMNode;\nuse Symfony\\Component\\DomCrawler\\Cra"
},
{
"path": "src/Steps/Dom/XmlElement.php",
"chars": 694,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Dom;\n\nuse DOMNode;\nuse Symfony\\Component\\DomCrawler\\Crawler;\n\n/**\n * @method XmlEle"
},
{
"path": "src/Steps/Dom.php",
"chars": 9868,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\nuse Crwlr\\Crawl"
},
{
"path": "src/Steps/Exceptions/PreRunValidationException.php",
"chars": 119,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Exceptions;\n\nuse Exception;\n\nclass PreRunValidationException extends Exception {}\n"
},
{
"path": "src/Steps/Filters/AbstractFilter.php",
"chars": 2161,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\nuse Exception;\nuse InvalidArgumentException;\n\nabstract class AbstractFilt"
},
{
"path": "src/Steps/Filters/ArrayFilter.php",
"chars": 576,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\nuse Exception;\n\nclass ArrayFilter extends AbstractFilter\n{\n use Filter"
},
{
"path": "src/Steps/Filters/ClosureFilter.php",
"chars": 446,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\nuse Closure;\nuse Exception;\n\nclass ClosureFilter extends AbstractFilter\n{"
},
{
"path": "src/Steps/Filters/ComparisonFilter.php",
"chars": 529,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\ComparisonFilterRule;\nuse Exception"
},
{
"path": "src/Steps/Filters/Enums/ComparisonFilterRule.php",
"chars": 682,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters\\Enums;\n\nenum ComparisonFilterRule\n{\n case Equal;\n\n case NotEqual;\n\n "
},
{
"path": "src/Steps/Filters/Enums/StringFilterRule.php",
"chars": 462,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters\\Enums;\n\nenum StringFilterRule\n{\n case Contains;\n\n case StartsWith;\n\n "
},
{
"path": "src/Steps/Filters/Enums/StringLengthFilterRule.php",
"chars": 812,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters\\Enums;\n\nenum StringLengthFilterRule\n{\n case Equal;\n\n case NotEqual;\n\n"
},
{
"path": "src/Steps/Filters/Enums/UrlFilterRule.php",
"chars": 1113,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters\\Enums;\n\nuse Crwlr\\Url\\Exceptions\\InvalidUrlException;\nuse Crwlr\\Url\\Url;\nus"
},
{
"path": "src/Steps/Filters/Filter.php",
"chars": 4237,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\nuse Closure;\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\ComparisonFilterRule;\n"
},
{
"path": "src/Steps/Filters/FilterInterface.php",
"chars": 587,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\ninterface FilterInterface\n{\n /**\n * When the value that will be ev"
},
{
"path": "src/Steps/Filters/Filterable.php",
"chars": 2204,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\BaseStep;\nuse Exception;\nuse InvalidArgumentExcep"
},
{
"path": "src/Steps/Filters/NegatedFilter.php",
"chars": 738,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\nfinal class NegatedFilter implements FilterInterface\n{\n public functio"
},
{
"path": "src/Steps/Filters/StringFilter.php",
"chars": 651,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\StringFilterRule;\nuse Exception;\n\nc"
},
{
"path": "src/Steps/Filters/StringLengthFilter.php",
"chars": 672,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\StringLengthFilterRule;\nuse Excepti"
},
{
"path": "src/Steps/Filters/UrlFilter.php",
"chars": 619,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\UrlFilterRule;\nuse Exception;\n\nclas"
},
{
"path": "src/Steps/Group.php",
"chars": 6428,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nuse Crwlr\\Crawler\\Input;\nuse Crwlr\\Crawler\\Loader\\LoaderInterface;\nuse Crwlr\\Craw"
},
{
"path": "src/Steps/Html/CssSelector.php",
"chars": 1582,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Html;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlDocument;\nuse Crwlr\\Crawler\\Steps\\Dom\\Node;\n"
},
{
"path": "src/Steps/Html/DomQuery.php",
"chars": 8024,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Html;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlDocument;\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlEl"
},
{
"path": "src/Steps/Html/Exceptions/InvalidDomQueryException.php",
"chars": 1455,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Html\\Exceptions;\n\nuse DOMException;\nuse Exception;\nuse Symfony\\Component\\CssSelecto"
},
{
"path": "src/Steps/Html/GetLink.php",
"chars": 7121,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Html;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\nuse Crwlr\\"
},
{
"path": "src/Steps/Html/GetLinks.php",
"chars": 731,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Html;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlDocument;\nuse Exception;\nuse Generator;\n\ncla"
},
{
"path": "src/Steps/Html/MetaData.php",
"chars": 2113,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Html;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\nuse Crwlr\\"
},
{
"path": "src/Steps/Html/SchemaOrg.php",
"chars": 3172,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Html;\n\nuse Adbar\\Dot;\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionExcept"
},
{
"path": "src/Steps/Html/SelectorTarget.php",
"chars": 166,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Html;\n\nenum SelectorTarget\n{\n case Text;\n\n case FormattedText;\n\n case Html"
},
{
"path": "src/Steps/Html/XPathQuery.php",
"chars": 1440,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Html;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\Node;\nuse Crwlr\\Crawler\\Steps\\Dom\\NodeList;\nuse "
},
{
"path": "src/Steps/Html.php",
"chars": 1657,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\nuse Crwlr\\Crawl"
},
{
"path": "src/Steps/Json.php",
"chars": 3550,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nuse Adbar\\Dot;\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlDocument;\nuse Crwlr\\Utils\\Json as "
},
{
"path": "src/Steps/Loading/GetSitemapsFromRobotsTxt.php",
"chars": 1040,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading;\n\nuse Crwlr\\Crawler\\Loader\\Http\\HttpLoader;\nuse Crwlr\\Crawler\\Steps\\Step;\nu"
},
{
"path": "src/Steps/Loading/Http/AbstractPaginator.php",
"chars": 4021,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http;\n\nuse Closure;\nuse Crwlr\\Crawler\\Loader\\Http\\Messages\\RespondedRequest"
},
{
"path": "src/Steps/Loading/Http/Browser/BrowserAction.php",
"chars": 5943,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Browser;\n\nuse Closure;\nuse Crwlr\\Crawler\\Loader\\Http\\Browser\\Screensho"
},
{
"path": "src/Steps/Loading/Http/Document.php",
"chars": 2351,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Messages\\RespondedRequest;\nuse Crwlr\\C"
},
{
"path": "src/Steps/Loading/Http/Paginate.php",
"chars": 2564,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Exceptions\\LoadingException;\nuse Crwlr"
},
{
"path": "src/Steps/Loading/Http/Paginator.php",
"chars": 851,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http;\n\nuse Crwlr\\Crawler\\Steps\\Html\\DomQuery;\nuse Crwlr\\Crawler\\Steps\\Html\\"
},
{
"path": "src/Steps/Loading/Http/Paginators/QueryParams/AbstractQueryParamManipulator.php",
"chars": 1216,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\QueryParams;\n\nuse Adbar\\Dot;\nuse Crwlr\\QueryString\\Query;\nu"
},
{
"path": "src/Steps/Loading/Http/Paginators/QueryParams/Decrementor.php",
"chars": 961,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\QueryParams;\n\nuse Adbar\\Dot;\nuse Crwlr\\QueryString\\Query;\nu"
},
{
"path": "src/Steps/Loading/Http/Paginators/QueryParams/Incrementor.php",
"chars": 961,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\QueryParams;\n\nuse Adbar\\Dot;\nuse Crwlr\\QueryString\\Query;\nu"
},
{
"path": "src/Steps/Loading/Http/Paginators/QueryParams/QueryParamManipulator.php",
"chars": 191,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\QueryParams;\n\nuse Crwlr\\QueryString\\Query;\n\ninterface Query"
},
{
"path": "src/Steps/Loading/Http/Paginators/QueryParamsPaginator.php",
"chars": 3058,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators;\n\nuse Crwlr\\Crawler\\Steps\\Loading\\Http;\nuse Crwlr\\Crawler\\S"
},
{
"path": "src/Steps/Loading/Http/Paginators/SimpleWebsitePaginator.php",
"chars": 6555,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Messages\\RespondedRequest;\n"
},
{
"path": "src/Steps/Loading/Http/Paginators/StopRules/Contains.php",
"chars": 753,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\StopRules;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibE"
},
{
"path": "src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInDom.php",
"chars": 2011,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\StopRules;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibE"
},
{
"path": "src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInHtml.php",
"chars": 316,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\StopRules;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\DomDocument;\nuse Cr"
},
{
"path": "src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInJson.php",
"chars": 855,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\StopRules;\n\nuse Adbar\\Dot;\nuse Crwlr\\Crawler\\Loader\\Http\\Me"
},
{
"path": "src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInXml.php",
"chars": 313,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\StopRules;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\DomDocument;\nuse Cr"
},
{
"path": "src/Steps/Loading/Http/Paginators/StopRules/IsEmptyResponse.php",
"chars": 587,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\StopRules;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Messages\\Responde"
},
{
"path": "src/Steps/Loading/Http/Paginators/StopRules/NotContains.php",
"chars": 757,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\StopRules;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibE"
},
{
"path": "src/Steps/Loading/Http/Paginators/StopRules/PaginatorStopRules.php",
"chars": 912,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\StopRules;\n\nuse Crwlr\\Crawler\\Steps\\Html\\DomQuery;\n\nclass P"
},
{
"path": "src/Steps/Loading/Http/Paginators/StopRules/StopRule.php",
"chars": 295,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading\\Http\\Paginators\\StopRules;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Messages\\Responde"
},
{
"path": "src/Steps/Loading/Http.php",
"chars": 5159,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\nuse Crw"
},
{
"path": "src/Steps/Loading/HttpBase.php",
"chars": 15069,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading;\n\nuse Closure;\nuse Crwlr\\Crawler\\Loader\\Http\\Exceptions\\LoadingException;\nu"
},
{
"path": "src/Steps/Loading/HttpCrawl.php",
"chars": 12814,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading;\n\nuse Closure;\nuse Crwlr\\Crawler\\Loader\\Http\\Messages\\RespondedRequest;\nuse"
},
{
"path": "src/Steps/Loading/LoadingStep.php",
"chars": 829,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Loading;\n\nuse Crwlr\\Crawler\\Loader\\LoaderInterface;\n\n/**\n * @template T of LoaderIn"
},
{
"path": "src/Steps/Refiners/AbstractRefiner.php",
"chars": 570,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners;\n\nuse Psr\\Log\\LoggerInterface;\n\nabstract class AbstractRefiner implements "
},
{
"path": "src/Steps/Refiners/DateTime/DateTimeFormat.php",
"chars": 1555,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\DateTime;\n\nuse Crwlr\\Crawler\\Steps\\Refiners\\String\\AbstractStringRefiner;\n"
},
{
"path": "src/Steps/Refiners/DateTimeRefiner.php",
"chars": 314,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners;\n\nuse Crwlr\\Crawler\\Steps\\Refiners\\DateTime\\DateTimeFormat;\n\nclass DateTim"
},
{
"path": "src/Steps/Refiners/Html/RemoveFromHtml.php",
"chars": 2035,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\Html;\n\nuse Crwlr\\Crawler\\Steps\\Dom;\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlDocume"
},
{
"path": "src/Steps/Refiners/HtmlRefiner.php",
"chars": 299,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners;\n\nuse Crwlr\\Crawler\\Steps\\Html\\DomQuery;\nuse Crwlr\\Crawler\\Steps\\Refiners\\"
},
{
"path": "src/Steps/Refiners/RefinerInterface.php",
"chars": 223,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners;\n\nuse Psr\\Log\\LoggerInterface;\n\ninterface RefinerInterface\n{\n public fu"
},
{
"path": "src/Steps/Refiners/String/AbstractStringRefiner.php",
"chars": 823,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\String;\n\nuse Closure;\nuse Crwlr\\Crawler\\Steps\\Refiners\\AbstractRefiner;\n\na"
},
{
"path": "src/Steps/Refiners/String/StrAfterFirst.php",
"chars": 553,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\String;\n\nclass StrAfterFirst extends AbstractStringRefiner\n{\n public fu"
},
{
"path": "src/Steps/Refiners/String/StrAfterLast.php",
"chars": 541,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\String;\n\nclass StrAfterLast extends AbstractStringRefiner\n{\n public fun"
},
{
"path": "src/Steps/Refiners/String/StrBeforeFirst.php",
"chars": 480,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\String;\n\nclass StrBeforeFirst extends AbstractStringRefiner\n{\n public f"
},
{
"path": "src/Steps/Refiners/String/StrBeforeLast.php",
"chars": 645,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\String;\n\nclass StrBeforeLast extends AbstractStringRefiner\n{\n public fu"
},
{
"path": "src/Steps/Refiners/String/StrBetweenFirst.php",
"chars": 827,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\String;\n\nclass StrBetweenFirst extends AbstractStringRefiner\n{\n public "
},
{
"path": "src/Steps/Refiners/String/StrBetweenLast.php",
"chars": 753,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\String;\n\nclass StrBetweenLast extends AbstractStringRefiner\n{\n public f"
},
{
"path": "src/Steps/Refiners/String/StrReplace.php",
"chars": 950,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\String;\n\nclass StrReplace extends AbstractStringRefiner\n{\n /**\n * @"
},
{
"path": "src/Steps/Refiners/StringRefiner.php",
"chars": 1486,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners;\n\nuse Crwlr\\Crawler\\Steps\\Refiners\\String\\StrAfterFirst;\nuse Crwlr\\Crawler"
},
{
"path": "src/Steps/Refiners/Url/AbstractUrlRefiner.php",
"chars": 1067,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\Url;\n\nuse Crwlr\\Crawler\\Steps\\Refiners\\AbstractRefiner;\nuse Crwlr\\Url\\Exce"
},
{
"path": "src/Steps/Refiners/Url/WithFragment.php",
"chars": 581,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\Url;\n\nuse Crwlr\\Url\\Exceptions\\InvalidUrlComponentException;\nuse Crwlr\\Url"
},
{
"path": "src/Steps/Refiners/Url/WithHost.php",
"chars": 561,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\Url;\n\nuse Crwlr\\Url\\Exceptions\\InvalidUrlComponentException;\nuse Crwlr\\Url"
},
{
"path": "src/Steps/Refiners/Url/WithPath.php",
"chars": 561,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\Url;\n\nuse Crwlr\\Url\\Exceptions\\InvalidUrlComponentException;\nuse Crwlr\\Url"
},
{
"path": "src/Steps/Refiners/Url/WithPort.php",
"chars": 558,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\Url;\n\nuse Crwlr\\Url\\Exceptions\\InvalidUrlComponentException;\nuse Crwlr\\Url"
},
{
"path": "src/Steps/Refiners/Url/WithQuery.php",
"chars": 566,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\Url;\n\nuse Crwlr\\Url\\Exceptions\\InvalidUrlComponentException;\nuse Crwlr\\Url"
},
{
"path": "src/Steps/Refiners/Url/WithScheme.php",
"chars": 571,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\Url;\n\nuse Crwlr\\Url\\Exceptions\\InvalidUrlComponentException;\nuse Crwlr\\Url"
},
{
"path": "src/Steps/Refiners/Url/WithoutPort.php",
"chars": 492,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners\\Url;\n\nuse Crwlr\\Url\\Exceptions\\InvalidUrlComponentException;\nuse Crwlr\\Url"
},
{
"path": "src/Steps/Refiners/UrlRefiner.php",
"chars": 1404,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Refiners;\n\nuse Crwlr\\Crawler\\Steps\\Refiners\\Url\\WithFragment;\nuse Crwlr\\Crawler\\Ste"
},
{
"path": "src/Steps/Sitemap/GetUrlsFromSitemap.php",
"chars": 2532,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps\\Sitemap;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\nuse Crw"
},
{
"path": "src/Steps/Sitemap.php",
"chars": 429,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nuse Crwlr\\Crawler\\Steps\\Loading\\GetSitemapsFromRobotsTxt;\nuse Crwlr\\Crawler\\Steps"
},
{
"path": "src/Steps/Step.php",
"chars": 10602,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nuse Closure;\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\nus"
},
{
"path": "src/Steps/StepInterface.php",
"chars": 1484,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nuse Crwlr\\Crawler\\Input;\nuse Crwlr\\Crawler\\Output;\nuse Crwlr\\Crawler\\Steps\\Filter"
},
{
"path": "src/Steps/StepOutputType.php",
"chars": 133,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nenum StepOutputType\n{\n case Scalar;\n\n case AssociativeArrayOrObject;\n\n c"
},
{
"path": "src/Steps/Xml.php",
"chars": 947,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Steps;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\nuse Crwlr\\Crawl"
},
{
"path": "src/Stores/JsonFileStore.php",
"chars": 1061,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Stores;\n\nuse Crwlr\\Crawler\\Result;\nuse Exception;\n\nclass JsonFileStore extends Store\n{\n "
},
{
"path": "src/Stores/SimpleCsvFileStore.php",
"chars": 1900,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Stores;\n\nuse Crwlr\\Crawler\\Result;\nuse Exception;\n\nclass SimpleCsvFileStore extends Store"
},
{
"path": "src/Stores/Store.php",
"chars": 300,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Stores;\n\nuse Psr\\Log\\LoggerInterface;\n\nabstract class Store implements StoreInterface\n{\n "
},
{
"path": "src/Stores/StoreInterface.php",
"chars": 239,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Stores;\n\nuse Crwlr\\Crawler\\Result;\nuse Psr\\Log\\LoggerInterface;\n\ninterface StoreInterface"
},
{
"path": "src/UserAgents/BotUserAgent.php",
"chars": 1204,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\UserAgents;\n\nclass BotUserAgent implements BotUserAgentInterface\n{\n /**\n * @param "
},
{
"path": "src/UserAgents/BotUserAgentInterface.php",
"chars": 151,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\UserAgents;\n\ninterface BotUserAgentInterface extends UserAgentInterface\n{\n public func"
},
{
"path": "src/UserAgents/UserAgent.php",
"chars": 381,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\UserAgents;\n\nclass UserAgent implements UserAgentInterface\n{\n public function __constr"
},
{
"path": "src/UserAgents/UserAgentInterface.php",
"chars": 119,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\UserAgents;\n\ninterface UserAgentInterface\n{\n public function __toString(): string;\n}\n"
},
{
"path": "src/Utils/Gzip.php",
"chars": 1181,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Utils;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\n\nclass Gzip\n{\n "
},
{
"path": "src/Utils/HttpHeaders.php",
"chars": 1857,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Utils;\n\nfinal class HttpHeaders\n{\n /**\n * @param array<string, string|string[]> $h"
},
{
"path": "src/Utils/OutputTypeHelper.php",
"chars": 1576,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Utils;\n\nclass OutputTypeHelper\n{\n /**\n * @return mixed[]\n */\n public static"
},
{
"path": "src/Utils/RequestKey.php",
"chars": 2372,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Utils;\n\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensionException;\nuse Crwlr\\Crawl"
},
{
"path": "src/Utils/TemplateString.php",
"chars": 1203,
"preview": "<?php\n\nnamespace Crwlr\\Crawler\\Utils;\n\nuse Adbar\\Dot;\n\nclass TemplateString\n{\n /**\n * @param mixed[] $data\n *"
},
{
"path": "tests/Cache/CacheItemTest.php",
"chars": 1289,
"preview": "<?php\n\nnamespace tests\\Cache;\n\nuse Crwlr\\Crawler\\Cache\\CacheItem;\nuse DateInterval;\nuse DateTimeImmutable;\n\nit('is seria"
},
{
"path": "tests/Cache/FileCacheTest.php",
"chars": 16399,
"preview": "<?php\n\nnamespace tests\\Cache;\n\nuse Crwlr\\Crawler\\Cache\\CacheItem;\nuse Crwlr\\Crawler\\Cache\\Exceptions\\MissingZlibExtensio"
},
{
"path": "tests/Cache/_cachefilecontent",
"chars": 40581,
"preview": "a:8:{s:13:\"requestMethod\";s:3:\"GET\";s:10:\"requestUri\";s:74:\"https://www.crwlr.software/blog/dealing-with-http-url-query-"
},
{
"path": "tests/CrawlerTest.php",
"chars": 24974,
"preview": "<?php\n\nnamespace tests;\n\nuse Crwlr\\Crawler\\Steps\\Exceptions\\PreRunValidationException;\nuse Crwlr\\Crawler\\Steps\\StepOutpu"
},
{
"path": "tests/HttpCrawler/AnonymousHttpCrawlerBuilderTest.php",
"chars": 1642,
"preview": "<?php\n\nuse Crwlr\\Crawler\\HttpCrawler;\nuse Crwlr\\Crawler\\Loader\\Http\\HttpLoader;\nuse Crwlr\\Crawler\\UserAgents\\BotUserAgen"
},
{
"path": "tests/IoTest.php",
"chars": 5802,
"preview": "<?php\n\nnamespace tests;\n\nuse Crwlr\\Crawler\\Io;\n\n/**\n * @param mixed[] $keep\n */\nfunction helper_getIoInstance(\n mixed"
},
{
"path": "tests/Loader/Http/Browser/ScreenshotConfigTest.php",
"chars": 3744,
"preview": "<?php\n\nnamespace tests\\Loader\\Http\\Browser;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Browser\\ScreenshotConfig;\nuse HeadlessChromiu"
},
{
"path": "tests/Loader/Http/Cache/RetryManagerTest.php",
"chars": 2290,
"preview": "<?php\n\nnamespace tests\\Loader\\Http\\Cache;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Cache\\RetryManager;\n\nit('returns true for statu"
},
{
"path": "tests/Loader/Http/Cookies/CookieJarTest.php",
"chars": 4623,
"preview": "<?php\n\nnamespace tests\\Loader\\Http\\Cookies;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Cookies\\CookieJar;\nuse Crwlr\\Url\\Url;\nuse Guz"
},
{
"path": "tests/Loader/Http/Cookies/CookieTest.php",
"chars": 13007,
"preview": "<?php\n\nnamespace tests\\Loader\\Http\\Cookies;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Cookies\\Exceptions\\InvalidCookieException;\nus"
},
{
"path": "tests/Loader/Http/Cookies/DateTest.php",
"chars": 923,
"preview": "<?php\n\nnamespace tests\\Loader\\Http\\Cookies;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Cookies\\Date;\nuse DateTimeZone;\n\ntest('It can"
},
{
"path": "tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php",
"chars": 11349,
"preview": "<?php\n\nnamespace tests\\Loader\\Http;\n\nuse Closure;\nuse Crwlr\\Crawler\\Loader\\Http\\Cookies\\CookieJar;\nuse Crwlr\\Crawler\\Loa"
},
{
"path": "tests/Loader/Http/HttpLoaderPolitenessTest.php",
"chars": 6348,
"preview": "<?php\n\nnamespace tests\\Loader\\Http;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Exceptions\\LoadingException;\nuse Crwlr\\Crawler\\Loader"
},
{
"path": "tests/Loader/Http/HttpLoaderTest.php",
"chars": 37399,
"preview": "<?php\n\nnamespace tests\\Loader\\Http;\n\nuse Crwlr\\Crawler\\Cache\\FileCache;\nuse Crwlr\\Crawler\\Loader\\Http\\Cookies\\CookieJar;"
},
{
"path": "tests/Loader/Http/Messages/RespondedRequestTest.php",
"chars": 12288,
"preview": "<?php\n\nnamespace tests\\Loader\\Http\\Messages;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Browser\\Screenshot;\nuse Crwlr\\Crawler\\Loader"
},
{
"path": "tests/Loader/Http/Politeness/RobotsTxtHandlerTest.php",
"chars": 3991,
"preview": "<?php\n\nnamespace tests\\Loader\\Http\\Politeness;\n\nuse Crwlr\\Crawler\\Loader\\Http\\HttpLoader;\nuse Crwlr\\Crawler\\Loader\\Http\\"
},
{
"path": "tests/Loader/Http/Politeness/ThrottlerTest.php",
"chars": 5342,
"preview": "<?php\n\nnamespace tests\\Loader\\Http\\Politeness;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Politeness\\Throttler;\nuse Crwlr\\Crawler\\Lo"
},
{
"path": "tests/Loader/Http/Politeness/TimingUnits/MultipleOfTest.php",
"chars": 374,
"preview": "<?php\n\nnamespace tests\\Loader\\Http\\Politeness\\TimingUnits;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Politeness\\TimingUnits\\Multipl"
},
{
"path": "tests/Loader/Http/ProxyManagerTest.php",
"chars": 1337,
"preview": "<?php\n\nnamespace tests\\Loader\\Http;\n\nuse Crwlr\\Crawler\\Loader\\Http\\ProxyManager;\n\nit('knows if it manages only one or mu"
},
{
"path": "tests/Loader/LoaderTest.php",
"chars": 3858,
"preview": "<?php\n\nnamespace tests\\Loader;\n\nuse Crwlr\\Crawler\\Loader\\Loader;\nuse Crwlr\\Crawler\\UserAgents\\BotUserAgent;\nuse Mockery;"
},
{
"path": "tests/Logger/CliLoggerTest.php",
"chars": 1229,
"preview": "<?php\n\nnamespace tests\\Logger;\n\nuse Crwlr\\Crawler\\Logger\\CliLogger;\nuse PHPUnit\\Framework\\TestCase;\n\n/** @var TestCase $"
},
{
"path": "tests/Logger/PreStepInvocationLoggerTest.php",
"chars": 1463,
"preview": "<?php\n\nnamespace tests\\Logger;\n\nuse Crwlr\\Crawler\\Logger\\PreStepInvocationLogger;\nuse tests\\_Stubs\\DummyLogger;\n\nit('log"
},
{
"path": "tests/Pest.php",
"chars": 9964,
"preview": "<?php\n\nnamespace tests;\n\nuse Crwlr\\Crawler\\HttpCrawler;\nuse Crwlr\\Crawler\\Input;\nuse Crwlr\\Crawler\\Loader\\Http\\HttpLoade"
},
{
"path": "tests/ResultTest.php",
"chars": 4372,
"preview": "<?php\n\nnamespace tests;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Browser\\Screenshot;\nuse Crwlr\\Crawler\\Loader\\Http\\Messages\\Respon"
},
{
"path": "tests/Steps/BaseStepTest.php",
"chars": 33033,
"preview": "<?php\n\nnamespace tests\\Steps;\n\nuse Crwlr\\Crawler\\Crawler;\nuse Crwlr\\Crawler\\HttpCrawler;\nuse Crwlr\\Crawler\\Input;\nuse Cr"
},
{
"path": "tests/Steps/CsvTest.php",
"chars": 17073,
"preview": "<?php\n\nnamespace tests\\Steps;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Messages\\RespondedRequest;\nuse Crwlr\\Crawler\\Input;\nuse Crw"
},
{
"path": "tests/Steps/Dom/HtmlDocumentTest.php",
"chars": 2626,
"preview": "<?php\n\nnamespace tests\\Steps\\Dom;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlDocument;\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlElement;\nus"
},
{
"path": "tests/Steps/Dom/HtmlElementTest.php",
"chars": 4909,
"preview": "<?php\n\nnamespace tests\\Steps\\Dom;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlDocument;\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlElement;\nus"
},
{
"path": "tests/Steps/Dom/NodeListTest.php",
"chars": 6341,
"preview": "<?php\n\nnamespace Tests\\Steps\\Dom;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlDocument;\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlElement;\nus"
},
{
"path": "tests/Steps/Dom/NodeTest.php",
"chars": 28100,
"preview": "<?php\n\nnamespace Tests\\Steps\\Dom;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlElement;\nuse Crwlr\\Crawler\\Steps\\Dom\\Node;\nuse Crwlr\\"
},
{
"path": "tests/Steps/Dom/XmlDocumentTest.php",
"chars": 2760,
"preview": "<?php\n\nnamespace tests\\Steps\\Dom;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\NodeList;\nuse Crwlr\\Crawler\\Steps\\Dom\\XmlDocument;\nuse Cr"
},
{
"path": "tests/Steps/Dom/XmlElementTest.php",
"chars": 3199,
"preview": "<?php\n\nnamespace tests\\Steps\\Dom;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\NodeList;\nuse Crwlr\\Crawler\\Steps\\Dom\\XmlDocument;\nuse Cr"
},
{
"path": "tests/Steps/Dom/_Stubs/HtmlNodeStub.php",
"chars": 442,
"preview": "<?php\n\nnamespace tests\\Steps\\Dom\\_Stubs;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlElement;\nuse Crwlr\\Crawler\\Steps\\Dom\\Node;\n\ncl"
},
{
"path": "tests/Steps/Dom/_Stubs/XmlNodeStub.php",
"chars": 439,
"preview": "<?php\n\nnamespace tests\\Steps\\Dom\\_Stubs;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\Node;\nuse Crwlr\\Crawler\\Steps\\Dom\\XmlElement;\n\ncla"
},
{
"path": "tests/Steps/DomTest.php",
"chars": 13720,
"preview": "<?php\n\nnamespace tests\\Steps;\n\nuse Crwlr\\Crawler\\Loader\\Http\\Messages\\RespondedRequest;\nuse Crwlr\\Crawler\\Input;\nuse Crw"
},
{
"path": "tests/Steps/Filters/ArrayFilterTest.php",
"chars": 3407,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Filter;\n\nit('filters an array of string values', "
},
{
"path": "tests/Steps/Filters/ClosureFilterTest.php",
"chars": 1261,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\ClosureFilter;\n\nuse function tests\\helper_getStdC"
},
{
"path": "tests/Steps/Filters/ComparisonFilterTest.php",
"chars": 1219,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\ComparisonFilter;\nuse Crwlr\\Crawler\\Steps\\Filters"
},
{
"path": "tests/Steps/Filters/Enums/ComparisonFilterRuleTest.php",
"chars": 3945,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters\\Enums;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\ComparisonFilterRule;\n\nit('correctly "
},
{
"path": "tests/Steps/Filters/Enums/StringFilterRuleTest.php",
"chars": 1543,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters\\Enums;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\StringFilterRule;\n\nit('checks if a st"
},
{
"path": "tests/Steps/Filters/Enums/StringLengthFilterRuleTest.php",
"chars": 2477,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters\\Enums;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\StringLengthFilterRule;\n\nit('correctl"
},
{
"path": "tests/Steps/Filters/Enums/UrlFilterRuleTest.php",
"chars": 2899,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters\\Enums;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\UrlFilterRule;\n\nit('checks if a URL h"
},
{
"path": "tests/Steps/Filters/FilterTest.php",
"chars": 1672,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\AbstractFilter;\nuse Exception;\nuse InvalidArgumen"
},
{
"path": "tests/Steps/Filters/NegatedFilterTest.php",
"chars": 493,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Filter;\nuse Crwlr\\Crawler\\Steps\\Filters\\NegatedFi"
},
{
"path": "tests/Steps/Filters/StringFilterTest.php",
"chars": 1346,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\StringFilterRule;\nuse Crwlr\\Crawler\\Steps\\F"
},
{
"path": "tests/Steps/Filters/StringLengthFilterTest.php",
"chars": 1344,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\StringLengthFilterRule;\nuse Crwlr\\Crawler\\S"
},
{
"path": "tests/Steps/Filters/UrlFilterTest.php",
"chars": 1500,
"preview": "<?php\n\nnamespace tests\\Steps\\Filters;\n\nuse Crwlr\\Crawler\\Steps\\Filters\\Enums\\UrlFilterRule;\nuse Crwlr\\Crawler\\Steps\\Filt"
},
{
"path": "tests/Steps/GroupTest.php",
"chars": 27618,
"preview": "<?php\n\nnamespace tests\\Steps;\n\nuse Closure;\nuse Crwlr\\Crawler\\Crawler;\nuse Crwlr\\Crawler\\Input;\nuse Crwlr\\Crawler\\Loader"
},
{
"path": "tests/Steps/Html/CssSelectorTest.php",
"chars": 6649,
"preview": "<?php\n\nnamespace tests\\Steps\\Html;\n\nuse Crwlr\\Crawler\\Steps\\Dom\\HtmlDocument;\nuse Crwlr\\Crawler\\Steps\\Html\\CssSelector;\n"
}
]
// ... and 126 more files (download for full content)
About this extraction
This page contains the full source code of the crwlrsoft/crawler GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 326 files (1.1 MB), approximately 307.0k tokens, and a symbol index with 1268 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.