Repository: crwlrsoft/crawler
Branch: main
Commit: d6680f9e698a
Files: 326
Total size: 1.1 MB

Directory structure:
gitextract_d22hbn5_/

├── .editorconfig
├── .gitattributes
├── .github/
│   └── workflows/
│       └── ci.yml
├── .gitignore
├── .php-cs-fixer.php
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── bin/
│   └── add-git-hooks
├── composer.json
├── git-hooks/
│   └── pre-commit
├── phpstan.neon
├── phpunit.xml
├── src/
│   ├── Cache/
│   │   ├── CacheItem.php
│   │   ├── Exceptions/
│   │   │   ├── MissingZlibExtensionException.php
│   │   │   └── ReadingCacheFailedException.php
│   │   └── FileCache.php
│   ├── Crawler.php
│   ├── HttpCrawler/
│   │   └── AnonymousHttpCrawlerBuilder.php
│   ├── HttpCrawler.php
│   ├── Input.php
│   ├── Io.php
│   ├── Loader/
│   │   ├── Http/
│   │   │   ├── Browser/
│   │   │   │   ├── Screenshot.php
│   │   │   │   └── ScreenshotConfig.php
│   │   │   ├── Cache/
│   │   │   │   └── RetryManager.php
│   │   │   ├── Cookies/
│   │   │   │   ├── Cookie.php
│   │   │   │   ├── CookieJar.php
│   │   │   │   ├── Date.php
│   │   │   │   └── Exceptions/
│   │   │   │       └── InvalidCookieException.php
│   │   │   ├── Exceptions/
│   │   │   │   └── LoadingException.php
│   │   │   ├── HeadlessBrowserLoaderHelper.php
│   │   │   ├── HttpLoader.php
│   │   │   ├── Messages/
│   │   │   │   └── RespondedRequest.php
│   │   │   ├── Politeness/
│   │   │   │   ├── RetryErrorResponseHandler.php
│   │   │   │   ├── RobotsTxtHandler.php
│   │   │   │   ├── Throttler.php
│   │   │   │   └── TimingUnits/
│   │   │   │       └── MultipleOf.php
│   │   │   └── ProxyManager.php
│   │   ├── Loader.php
│   │   └── LoaderInterface.php
│   ├── Logger/
│   │   ├── CliLogger.php
│   │   └── PreStepInvocationLogger.php
│   ├── Output.php
│   ├── Result.php
│   ├── Steps/
│   │   ├── BaseStep.php
│   │   ├── Csv.php
│   │   ├── Dom/
│   │   │   ├── DomDocument.php
│   │   │   ├── HtmlDocument.php
│   │   │   ├── HtmlElement.php
│   │   │   ├── Node.php
│   │   │   ├── NodeList.php
│   │   │   ├── XmlDocument.php
│   │   │   └── XmlElement.php
│   │   ├── Dom.php
│   │   ├── Exceptions/
│   │   │   └── PreRunValidationException.php
│   │   ├── Filters/
│   │   │   ├── AbstractFilter.php
│   │   │   ├── ArrayFilter.php
│   │   │   ├── ClosureFilter.php
│   │   │   ├── ComparisonFilter.php
│   │   │   ├── Enums/
│   │   │   │   ├── ComparisonFilterRule.php
│   │   │   │   ├── StringFilterRule.php
│   │   │   │   ├── StringLengthFilterRule.php
│   │   │   │   └── UrlFilterRule.php
│   │   │   ├── Filter.php
│   │   │   ├── FilterInterface.php
│   │   │   ├── Filterable.php
│   │   │   ├── NegatedFilter.php
│   │   │   ├── StringFilter.php
│   │   │   ├── StringLengthFilter.php
│   │   │   └── UrlFilter.php
│   │   ├── Group.php
│   │   ├── Html/
│   │   │   ├── CssSelector.php
│   │   │   ├── DomQuery.php
│   │   │   ├── Exceptions/
│   │   │   │   └── InvalidDomQueryException.php
│   │   │   ├── GetLink.php
│   │   │   ├── GetLinks.php
│   │   │   ├── MetaData.php
│   │   │   ├── SchemaOrg.php
│   │   │   ├── SelectorTarget.php
│   │   │   └── XPathQuery.php
│   │   ├── Html.php
│   │   ├── Json.php
│   │   ├── Loading/
│   │   │   ├── GetSitemapsFromRobotsTxt.php
│   │   │   ├── Http/
│   │   │   │   ├── AbstractPaginator.php
│   │   │   │   ├── Browser/
│   │   │   │   │   └── BrowserAction.php
│   │   │   │   ├── Document.php
│   │   │   │   ├── Paginate.php
│   │   │   │   ├── Paginator.php
│   │   │   │   └── Paginators/
│   │   │   │       ├── QueryParams/
│   │   │   │       │   ├── AbstractQueryParamManipulator.php
│   │   │   │       │   ├── Decrementor.php
│   │   │   │       │   ├── Incrementor.php
│   │   │   │       │   └── QueryParamManipulator.php
│   │   │   │       ├── QueryParamsPaginator.php
│   │   │   │       ├── SimpleWebsitePaginator.php
│   │   │   │       └── StopRules/
│   │   │   │           ├── Contains.php
│   │   │   │           ├── IsEmptyInDom.php
│   │   │   │           ├── IsEmptyInHtml.php
│   │   │   │           ├── IsEmptyInJson.php
│   │   │   │           ├── IsEmptyInXml.php
│   │   │   │           ├── IsEmptyResponse.php
│   │   │   │           ├── NotContains.php
│   │   │   │           ├── PaginatorStopRules.php
│   │   │   │           └── StopRule.php
│   │   │   ├── Http.php
│   │   │   ├── HttpBase.php
│   │   │   ├── HttpCrawl.php
│   │   │   └── LoadingStep.php
│   │   ├── Refiners/
│   │   │   ├── AbstractRefiner.php
│   │   │   ├── DateTime/
│   │   │   │   └── DateTimeFormat.php
│   │   │   ├── DateTimeRefiner.php
│   │   │   ├── Html/
│   │   │   │   └── RemoveFromHtml.php
│   │   │   ├── HtmlRefiner.php
│   │   │   ├── RefinerInterface.php
│   │   │   ├── String/
│   │   │   │   ├── AbstractStringRefiner.php
│   │   │   │   ├── StrAfterFirst.php
│   │   │   │   ├── StrAfterLast.php
│   │   │   │   ├── StrBeforeFirst.php
│   │   │   │   ├── StrBeforeLast.php
│   │   │   │   ├── StrBetweenFirst.php
│   │   │   │   ├── StrBetweenLast.php
│   │   │   │   └── StrReplace.php
│   │   │   ├── StringRefiner.php
│   │   │   ├── Url/
│   │   │   │   ├── AbstractUrlRefiner.php
│   │   │   │   ├── WithFragment.php
│   │   │   │   ├── WithHost.php
│   │   │   │   ├── WithPath.php
│   │   │   │   ├── WithPort.php
│   │   │   │   ├── WithQuery.php
│   │   │   │   ├── WithScheme.php
│   │   │   │   └── WithoutPort.php
│   │   │   └── UrlRefiner.php
│   │   ├── Sitemap/
│   │   │   └── GetUrlsFromSitemap.php
│   │   ├── Sitemap.php
│   │   ├── Step.php
│   │   ├── StepInterface.php
│   │   ├── StepOutputType.php
│   │   └── Xml.php
│   ├── Stores/
│   │   ├── JsonFileStore.php
│   │   ├── SimpleCsvFileStore.php
│   │   ├── Store.php
│   │   └── StoreInterface.php
│   ├── UserAgents/
│   │   ├── BotUserAgent.php
│   │   ├── BotUserAgentInterface.php
│   │   ├── UserAgent.php
│   │   └── UserAgentInterface.php
│   └── Utils/
│       ├── Gzip.php
│       ├── HttpHeaders.php
│       ├── OutputTypeHelper.php
│       ├── RequestKey.php
│       └── TemplateString.php
└── tests/
    ├── Cache/
    │   ├── CacheItemTest.php
    │   ├── FileCacheTest.php
    │   └── _cachefilecontent
    ├── CrawlerTest.php
    ├── HttpCrawler/
    │   └── AnonymousHttpCrawlerBuilderTest.php
    ├── IoTest.php
    ├── Loader/
    │   ├── Http/
    │   │   ├── Browser/
    │   │   │   └── ScreenshotConfigTest.php
    │   │   ├── Cache/
    │   │   │   └── RetryManagerTest.php
    │   │   ├── Cookies/
    │   │   │   ├── CookieJarTest.php
    │   │   │   ├── CookieTest.php
    │   │   │   └── DateTest.php
    │   │   ├── HeadlessBrowserLoaderHelperTest.php
    │   │   ├── HttpLoaderPolitenessTest.php
    │   │   ├── HttpLoaderTest.php
    │   │   ├── Messages/
    │   │   │   └── RespondedRequestTest.php
    │   │   ├── Politeness/
    │   │   │   ├── RobotsTxtHandlerTest.php
    │   │   │   ├── ThrottlerTest.php
    │   │   │   └── TimingUnits/
    │   │   │       └── MultipleOfTest.php
    │   │   └── ProxyManagerTest.php
    │   └── LoaderTest.php
    ├── Logger/
    │   ├── CliLoggerTest.php
    │   └── PreStepInvocationLoggerTest.php
    ├── Pest.php
    ├── ResultTest.php
    ├── Steps/
    │   ├── BaseStepTest.php
    │   ├── CsvTest.php
    │   ├── Dom/
    │   │   ├── HtmlDocumentTest.php
    │   │   ├── HtmlElementTest.php
    │   │   ├── NodeListTest.php
    │   │   ├── NodeTest.php
    │   │   ├── XmlDocumentTest.php
    │   │   ├── XmlElementTest.php
    │   │   └── _Stubs/
    │   │       ├── HtmlNodeStub.php
    │   │       └── XmlNodeStub.php
    │   ├── DomTest.php
    │   ├── Filters/
    │   │   ├── ArrayFilterTest.php
    │   │   ├── ClosureFilterTest.php
    │   │   ├── ComparisonFilterTest.php
    │   │   ├── Enums/
    │   │   │   ├── ComparisonFilterRuleTest.php
    │   │   │   ├── StringFilterRuleTest.php
    │   │   │   ├── StringLengthFilterRuleTest.php
    │   │   │   └── UrlFilterRuleTest.php
    │   │   ├── FilterTest.php
    │   │   ├── NegatedFilterTest.php
    │   │   ├── StringFilterTest.php
    │   │   ├── StringLengthFilterTest.php
    │   │   └── UrlFilterTest.php
    │   ├── GroupTest.php
    │   ├── Html/
    │   │   ├── CssSelectorTest.php
    │   │   ├── Exceptions/
    │   │   │   └── InvalidDomQueryExceptionTest.php
    │   │   ├── GetLinkTest.php
    │   │   ├── GetLinksTest.php
    │   │   ├── MetaDataTest.php
    │   │   ├── SchemaOrgTest.php
    │   │   └── XPathQueryTest.php
    │   ├── HtmlTest.php
    │   ├── JsonTest.php
    │   ├── Loading/
    │   │   ├── GetSitemapsFromRobotsTxtTest.php
    │   │   ├── Http/
    │   │   │   ├── DocumentTest.php
    │   │   │   └── Paginators/
    │   │   │       ├── AbstractPaginatorTest.php
    │   │   │       ├── QueryParams/
    │   │   │       │   ├── AbstractQueryParamManipulatorTest.php
    │   │   │       │   ├── DecrementorTest.php
    │   │   │       │   └── IncrementorTest.php
    │   │   │       ├── QueryParamsPaginatorTest.php
    │   │   │       ├── SimpleWebsitePaginatorTest.php
    │   │   │       └── StopRules/
    │   │   │           ├── ContainsTest.php
    │   │   │           ├── IsEmptyInHtmlTest.php
    │   │   │           ├── IsEmptyInJsonTest.php
    │   │   │           ├── IsEmptyInXmlTest.php
    │   │   │           ├── IsEmptyResponseTest.php
    │   │   │           └── NotContainsTest.php
    │   │   ├── HttpTest.php
    │   │   └── LoadingStepTest.php
    │   ├── Refiners/
    │   │   ├── AbstractRefinerTest.php
    │   │   ├── DateTime/
    │   │   │   └── DateTimeFormatTest.php
    │   │   ├── Html/
    │   │   │   └── RemoveFromHtmlTest.php
    │   │   ├── String/
    │   │   │   ├── AfterFirstTest.php
    │   │   │   ├── AfterLastTest.php
    │   │   │   ├── BeforeFirstTest.php
    │   │   │   ├── BeforeLastTest.php
    │   │   │   ├── BetweenFirstTest.php
    │   │   │   ├── BetweenLastTest.php
    │   │   │   └── ReplaceTest.php
    │   │   └── Url/
    │   │       ├── WithFragmentTest.php
    │   │       ├── WithHostTest.php
    │   │       ├── WithPathTest.php
    │   │       ├── WithPortTest.php
    │   │       ├── WithQueryTest.php
    │   │       ├── WithSchemeTest.php
    │   │       └── WithoutPortTest.php
    │   ├── Sitemap/
    │   │   └── GetUrlsFromSitemapTest.php
    │   ├── StepTest.php
    │   ├── XmlTest.php
    │   └── _Files/
    │       ├── Csv/
    │       │   ├── basic.csv
    │       │   ├── enclosure.csv
    │       │   ├── escape.csv
    │       │   ├── separator.csv
    │       │   └── with-column-headlines.csv
    │       ├── Html/
    │       │   ├── basic.html
    │       │   ├── bookstore.html
    │       │   └── event.html
    │       └── Xml/
    │           ├── bookstore.xml
    │           ├── events.xml
    │           └── rss-with-bom.xml
    ├── Stores/
    │   ├── JsonFileStoreTest.php
    │   ├── SimpleCsvFileStoreTest.php
    │   └── _files/
    │       └── .gitkeep
    ├── UserAgents/
    │   ├── BotUserAgentTest.php
    │   └── UserAgentTest.php
    ├── Utils/
    │   ├── GzipTest.php
    │   ├── HttpHeadersTest.php
    │   ├── OutputTypeHelperTest.php
    │   ├── RequestKeyTest.php
    │   └── TemplateStringTest.php
    ├── _Integration/
    │   ├── GroupTest.php
    │   ├── Http/
    │   │   ├── CharsetTest.php
    │   │   ├── CrawlingTest.php
    │   │   ├── ErrorResponsesTest.php
    │   │   ├── GzipTest.php
    │   │   ├── HeadlessBrowserTest.php
    │   │   ├── Html/
    │   │   │   ├── PaginatedListingTest.php
    │   │   │   └── SimpleListingTest.php
    │   │   ├── PaginationTest.php
    │   │   ├── ProxyingTest.php
    │   │   ├── PublisherExampleTest.php
    │   │   ├── QueryParamPaginationTest.php
    │   │   ├── RedirectTest.php
    │   │   ├── RequestParamsFromInputTest.php
    │   │   ├── RetryErrorResponsesTest.php
    │   │   ├── RobotsTxtTest.php
    │   │   └── TimeoutTest.php
    │   ├── ProxyServer.php
    │   ├── Server.php
    │   └── _Server/
    │       ├── BlogPostWithJsonLd.php
    │       ├── BrokenMimeTypeRss.php
    │       ├── BrowserActions/
    │       │   ├── ClickAndWaitForReload.php
    │       │   ├── EvaluateAndWaitForReload.php
    │       │   ├── EvaluateAndWaitForReloadReloaded.php
    │       │   ├── Main.php
    │       │   └── Wait.php
    │       ├── Crawling.php
    │       ├── HelloWorld.php
    │       ├── JsGeneratedContent.php
    │       ├── NonUtf8.php
    │       ├── PageInitScript.php
    │       ├── PaginatedListing/
    │       │   └── Detail.php
    │       ├── PaginatedListing.php
    │       ├── PrintCookie.php
    │       ├── PrintCookies.php
    │       ├── PrintHeaders.php
    │       ├── Publisher/
    │       │   ├── AuthorDetailPage.php
    │       │   ├── AuthorsListPage.php
    │       │   ├── BookDetailPage.php
    │       │   └── EditionDetailPage.php
    │       ├── QueryParamPagination.php
    │       ├── RssFeed.php
    │       ├── ServiceUnavailable.php
    │       ├── SetCookie.php
    │       ├── SetCookieJs.php
    │       ├── SetDelayedCookieJs.php
    │       ├── SetMultipleCookiesJs.php
    │       ├── SimpleListing/
    │       │   └── Detail.php
    │       ├── SimpleListing.php
    │       └── TooManyRequests.php
    ├── _Stubs/
    │   ├── AbstractTestPaginator.php
    │   ├── Crawlers/
    │   │   ├── DummyOne.php
    │   │   ├── DummyTwo/
    │   │   │   ├── DummyTwoLoader.php
    │   │   │   ├── DummyTwoLogger.php
    │   │   │   └── DummyTwoUserAgent.php
    │   │   └── DummyTwo.php
    │   ├── DummyLogger.php
    │   ├── PhantasyLoader.php
    │   └── RespondedRequestChild.php
    └── _Temp/
        ├── _cachedir/
        │   └── .gitkeep
        └── _storagedir/
            └── .gitkeep

================================================
FILE CONTENTS
================================================

================================================
FILE: .editorconfig
================================================
# EditorConfig is awesome: http://EditorConfig.org

root = true

[*]
charset = utf-8
end_of_line = lf
indent_style = space
indent_size = 4
insert_final_newline = true
trim_trailing_whitespace = true

[*.md]
trim_trailing_whitespace = false

[*.yml]
indent_size = 2

[_cachefilecontent]
insert_final_newline = false


================================================
FILE: .gitattributes
================================================
.github export-ignore
bin/add-git-hooks export-ignore
git-hooks export-ignore
tests export-ignore
.editorconfig export-ignore
.gitattributes export-ignore
.gitignore export-ignore
.php-cs-fixer.php export-ignore
phpstan.neon export-ignore
phpunit.xml export-ignore


================================================
FILE: .github/workflows/ci.yml
================================================
name: CI

on: pull_request

jobs:
  # Main test job: for every PHP version in the matrix it installs the
  # composer dependencies and then runs the `composer test` script followed by
  # the `composer test-integration` script.
  tests:
    name: PestPHP Tests
    runs-on: ubuntu-latest
    strategy:
      matrix:
        php-versions: ['8.1', '8.2', '8.3', '8.4', '8.5']

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install PHP
        uses: shivammathur/setup-php@v2
        with:
          php-version: ${{ matrix.php-versions }}

      - name: Install dependencies
        run: composer install --prefer-dist --no-progress

      - name: Run tests
        run: composer test

      - name: Run integration tests
        run: composer test-integration

  # Additional test job that only runs on PHP 8.4 and 8.5 and executes the
  # `composer test-php84` script.
  # NOTE(review): presumably this script covers the tests that need the
  # PHP >= 8.4 DOM API - confirm against the script definition in composer.json.
  tests84:
    name: PestPHP Tests Running only on PHP >= 8.4
    runs-on: ubuntu-latest
    strategy:
      matrix:
        php-versions: ['8.4', '8.5']

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install PHP
        uses: shivammathur/setup-php@v2
        with:
          php-version: ${{ matrix.php-versions }}

      - name: Install dependencies
        run: composer install --prefer-dist --no-progress

      - name: Run tests
        run: composer test-php84

  # Quality job: runs the `composer stan` (PHPStan) and `composer cs`
  # (PHP CS Fixer) scripts. It runs on a single PHP version (8.1) instead of a
  # matrix, and `coverage: none` is passed to setup-php because no coverage
  # report is produced here.
  stanAndCs:
    name: Static Analysis (phpstan) and Code Style (PHP CS Fixer)
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Install PHP
        uses: shivammathur/setup-php@v2
        with:
          php-version: '8.1'
          coverage: none

      - name: Install dependencies
        run: composer install --prefer-dist --no-progress

      - name: Run PHPStan
        run: composer stan

      - name: Run PHP CS Fixer
        run: composer cs


================================================
FILE: .gitignore
================================================
composer.lock
vendor
.php_cs.cache
.php-cs-fixer.cache
.phpunit.result.cache
.phpunit.cache
/cachedir
/storedir
/tests/_Temp/_cachedir/*
!/tests/_Temp/_cachedir/.gitkeep


================================================
FILE: .php-cs-fixer.php
================================================
<?php

use PhpCsFixer\Config;
use PhpCsFixer\Finder;
use PhpCsFixer\Runner\Parallel\ParallelConfigFactory;

// Directories (relative to the project root) that the fixer must not touch.
$excludedDirectories = ['tests/_Integration/_Server', '.github', 'bin', 'git-hooks'];

// Scan everything below the project root, minus the excluded directories.
$finder = Finder::create();
$finder->exclude($excludedDirectories);
$finder->in(__DIR__);

// The PER-CS preset plus a handful of project-specific rules.
$rules = [
    '@PER-CS' => true,
    'strict_param' => true,
    'array_syntax' => ['syntax' => 'short'],
    'no_unused_imports' => true,
    'operator_linebreak' => ['only_booleans' => true, 'position' => 'end'],
];

$config = new Config();
$config->setFinder($finder);
// Let the fixer decide on its own how many parallel workers to use.
$config->setParallelConfig(ParallelConfigFactory::detect());
$config->setRules($rules);
// `strict_param` is classified as a risky rule, so risky rules have to be allowed.
$config->setRiskyAllowed(true);
$config->setUsingCache(true);

return $config;


================================================
FILE: CHANGELOG.md
================================================
# Changelog
All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [3.5.6] - 2026-01-05
### Fixed
* Potential issues found with PHPStan 2 on level 8.

## [3.5.5] - 2025-08-05
### Fixed
* Removed the overriding `validateAndSanitizeInput()` method from the `Paginate` HTTP step to ensure features like `staticUrl()` and `useInputKeyAsUrl()` work correctly.
* The `Paginate` HTTP step now also supports receiving an array of URLs, initiating pagination separately for each one.

### Deprecated
* The `Crwlr\Crawler\Steps\Loading\Http\Paginate` class. It shall be removed and its behavior implemented in the `Http` class directly, in the next major version.

## [3.5.4] - 2025-07-28
### Fixed
* An issue in the `SimpleWebsitePaginator` when used with stop rules.

## [3.5.3] - 2025-06-10
### Fixed
* Issues with passing cookies from the cookie jar to the headless browser when using the `useBrowser()` method on `Http` steps, in cases where the loader wasn’t globally configured to use the browser for all requests.

## [3.5.2] - 2025-05-16
### Fixed
* The `Result::toArray()` method now converts all objects contained in the Result array (at any level of the array) to arrays. Also, if the only element in a result array has some autogenerated key containing "unnamed", but the value also is an associative array with string keys, the method only returns that child array.

## [3.5.1] - 2025-04-23
### Fixed
* An issue that occurred when a step uses the `PreStepInvocationLogger`. As refiners also use the logger, a newer logger (replacing the `PreStepInvocationLogger`) is now also passed to all registered refiners of a step.
* Enable applying refiners to output properties with array values. E.g. if a step outputs an array of URLs (`['https://...', 'https://...']`), a `UrlRefiner` will be applied to all those URLs.

## [3.5.0] - 2025-04-10
### Added
* Dynamically building request URLs from extracted data: `Http` steps now have a new `staticUrl()` method, and you can also use variables within that static URL - as well as in request headers and the body - like `https://www.example.com/foo/[crwl:some_extracted_property]`. These variables will be replaced with the corresponding properties from input data (also works with kept data).
* New Refiners:
    * `DateTimeRefiner::reformat('Y-m-d H:i:s')` to reformat a date time string to a different format. Tries to automatically recognize the input format. If this does not work, you can provide an input format to use as the second argument.
    * `HtmlRefiner::remove('#foo')` to remove nodes matching the given selector from selected HTML.
* Steps that produce multiple outputs per input can now group them per input by calling the new `Step::oneOutputPerInput()` method.

## [3.4.5] - 2025-04-09
### Fixed
* When feeding an `Http` step with a string that is not a valid URL (e.g. `https://`), the exception when trying to parse it as a URL is caught, and an error logged.

## [3.4.4] - 2025-04-04
### Fixed
* Since XML parsing errors sometimes occur because of characters that aren't valid within XML documents, the library now catches XML parsing errors, tries to find and replace invalid characters (with transliterations or HTML entities), and retries parsing the document. This works best when you additionally install the `voku/portable-ascii` composer package.

## [3.4.3] - 2025-04-03
### Fixed
* When providing an empty base selector to an `Html` step (`Html::each('')`, `Html::first('')`, `Html::last('')`), it won't fail with an error, but instead logs a warning that this most likely doesn't make sense.
* The `Step::keep()` methods now also work when applied to child steps within a group step.

## [3.4.2] - 2025-03-08
### Fixed
* Issue when using `Http::get()->useBrowser()->postBrowserNavigateHook()`. Previously, in this case, when the loader was configured to use the HTTP client, the post browser navigate hook was actually not set, because of an issue with the order in which things happened internally.

## [3.4.1] - 2025-03-08
### Fixed
* Since, when using the Chrome browser for loading, we can only execute GET requests:
    * The loader now automatically switches to the HTTP client for POST, PUT, PATCH, and DELETE requests and logs a warning.
    * A warning is logged when attempting to use "Post Browser Navigate Hooks" with POST, PUT, PATCH, or DELETE requests.
    * Consequently, the `useBrowser()` method, introduced in v3.4.0, is also limited to GET requests.

## [3.4.0] - 2025-03-06
### Added
* Two new methods to the base class of all `Http` steps:
    * `skipCache()` – Allows using the cache while skipping it for a specific loading step.
    * `useBrowser()` – Switches the loader to use a (headless) Chrome browser for loading calls in a specific step and then reverts the loader to its previous setting.
* Introduced the new `BrowserAction::screenshot()` post browser navigate hook. It accepts an instance of the new `ScreenshotConfig` class, allowing you to configure various options (see the methods of `ScreenshotConfig`). If successful, the screenshot file paths are included in the `RespondedRequest` output object of the `Http` step.

## [3.3.0] - 2025-03-02
### Added
* New `BrowserAction`s to use with the `postBrowserNavigateHook()` method: 
  * `BrowserAction::clickInsideShadowDom()`
  * `BrowserAction::moveMouseToElement()`
  * `BrowserAction::moveMouseToPosition()`
  * `BrowserAction::scrollDown()`
  * `BrowserAction::scrollUp()`
  * `BrowserAction::typeText()`
  * `BrowserAction::waitForReload()`
* A new method in `HeadlessBrowserLoaderHelper` to include the HTML content of shadow DOM elements in the returned HTML. Use it like this: `$crawler->getLoader()->browser()->includeShadowElementsInHtml()`.

### Changed
* The `BrowserAction::clickElement()` action now automatically waits for an element matching the selector to be rendered, before performing the click. This means you don't need to put a `BrowserAction::waitUntilDocumentContainsElement()` before it. It works the same in the new `BrowserAction::clickInsideShadowDom()` and `BrowserAction::moveMouseToElement()` actions.

### Deprecated
* `BrowserAction::clickElementAndWaitForReload()` and `BrowserAction::evaluateAndWaitForReload()`. As a replacement, please use `BrowserAction::clickElement()` or `BrowserAction::evaluate()` and `BrowserAction::waitForReload()` separately.

## [3.2.5] - 2025-02-26
### Fixed
* When a child step is nested in the `extract()` method of an `Html` or `Xml` step, and does not use `each()` as the base, the extracted value is an array with the keys defined in the `extract()` call, rather than an array of such arrays as it would be with `each()` as base.

## [3.2.4] - 2025-02-25
### Fixed
* Trying to load a relative reference URI (no scheme and host/authority, only path) via the `HttpLoader` now immediately logs (or throws when `loadOrFail()` is used) an error instead of trying to actually load it.

## [3.2.3] - 2025-01-28
### Fixed
* Fix deprecation warning triggered in the `DomQuery` class, when trying to get the value of an HTML/XML attribute that does not exist on the element.

## [3.2.2] - 2025-01-17
### Fixed
* Warnings about loader hooks being called multiple times, when using a `BotUserAgent` and therefore loading and respecting the robots.txt file, or when using the `Http::stopOnErrorResponse()` method.

## [3.2.1] - 2025-01-13
### Fixed
* Reuse previously opened page when using the (headless) Chrome browser, instead of opening a new page for each request.

## [3.2.0] - 2025-01-12
### Added
* `RespondedRequest::isServedFromCache()` to determine whether a response was served from cache or actually loaded.

## [3.1.5] - 2025-01-10
### Fixed
* Another improvement for getting XML source when using the browser, in cases where Chrome doesn't identify the response as an XML document (even though a Content-Type header is sent).

## [3.1.4] - 2025-01-10
### Fixed
* `HttpLoader::dontUseCookies()` now also works when using the Chrome browser. Cookies are cleared before every request.

## [3.1.3] - 2025-01-10
### Fixed
* Further improve getting the raw response body from non-HTML documents via Chrome browser.

## [3.1.2] - 2025-01-08
### Fixed
* When loading a non-HTML document (e.g., XML) via the Chrome browser, the library now retrieves the original source. Previously, it returned the outerHTML of the rendered document, which wrapped the content in an HTML structure.

## [3.1.1] - 2025-01-07
### Fixed
* When the `validateAndSanitize()` method of a step throws an `InvalidArgumentException`, the exception is now caught, logged and the step is not invoked with the invalid input. This improves fault tolerance. Feeding a step with one invalid input shouldn't cause the whole crawler run to fail. Exceptions other than `InvalidArgumentException` remain uncaught.

## [3.1.0] - 2025-01-03
### Added
* New method `HeadlessBrowserLoaderHelper::setPageInitScript()` (`$crawler->getLoader()->browser()->setPageInitScript()`) to provide javascript code that is executed on every new browser page before navigating anywhere.
* New method `HeadlessBrowserLoaderHelper::useNativeUserAgent()` (`$crawler->getLoader()->browser()->useNativeUserAgent()`) to allow using the native `User-Agent` that your Chrome browser sends by default.

## [3.0.4] - 2024-12-18
### Fixed
* Minor improvement for the `DomQuery` (base for `Dom::cssSelector()` and `Dom::xPath()`): enable providing an empty string as selector, to simply get the node that the selector is applied to.

## [3.0.3] - 2024-12-11
### Fixed
* Improved fix for non UTF-8 characters in HTML documents declared as UTF-8.

## [3.0.2] - 2024-12-11
### Fixed
* When the new PHP 8.4 DOM API is used, and HTML declared as UTF-8 contains non UTF-8 compatible characters, it does not replace them with a � character, but instead removes them. This behaviour is consistent with the data returned by Symfony DomCrawler.

## [3.0.1] - 2024-12-10
### Undeprecated
* Removed deprecations for all XPath functionality (`Dom::xPath()`, `XPathQuery` class and `Node::queryXPath()`), because it's still available with the new DOM API in PHP 8.4.

## [3.0.0] - 2024-12-08
The primary change in version 3.0.0 is that the library now leverages PHP 8.4’s new DOM API when used in an environment with PHP >= 8.4. To maintain compatibility with PHP < 8.4, an abstraction layer has been implemented. This layer dynamically uses either the Symfony DomCrawler component or the new DOM API, depending on the PHP version.

Since no direct interaction with an instance of the Symfony DomCrawler library was required at the step level provided by the library, it is highly likely that you won’t need to make any changes to your code to upgrade to v3. To ensure a smooth transition, please review the points under “Changed.”

### Changed
* __BREAKING__: The `DomQuery::innerText()` method (a.k.a. `Dom::cssSelector('...')->innerText()`) has been removed. `innerText` exists only in the Symfony DomCrawler component, and its usefulness is questionable. If you still require this variant of the DOM element text, please let us know or create a pull request yourself. Thank you!
* __BREAKING__: The `DomQueryInterface` was removed. As the `DomQuery` class offers a lot more functionality than the interface defines, the purpose of the interface was questionable. Please use the abstract `DomQuery` class instead. This also means that some method signatures, type hinting the interface, have changed. Look for occurrences of `DomQueryInterface` and replace them.
* __BREAKING__: The visibility of the `DomQuery::filter()` method was changed from public to protected. It is still needed in the `DomQuery` class, but outside of it, it is probably better and easier to directly use the new DOM abstraction (see the `src/Steps/Dom` directory). If you are extending the `DomQuery` class (which is not recommended), be aware that the argument now takes a `Node` (from the new DOM abstraction) instead of a Symfony `Crawler`.
* __BREAKING__: The `Step::validateAndSanitizeToDomCrawlerInstance()` method was removed. Please use the `Step::validateAndSanitizeToHtmlDocumentInstance()` and `Step::validateAndSanitizeToXmlDocumentInstance()` methods instead.
* __BREAKING__: The second argument in `Closure`s passed to the `Http::crawl()->customFilter()` has changed from an instance of Symfony `Crawler` class, to an `HtmlElement` instance from the new DOM abstraction (`Crwlr\Crawler\Steps\Dom\HtmlElement`).
* __BREAKING__: The Filter class was split into `AbstractFilter` (base class for actual filter classes) and `Filter` only hosting the static function for easy instantiation, because otherwise each filter class also has all the static methods.
* __BREAKING__: Further, the signatures of some methods that are mainly here for internal usage, have changed due to the new DOM abstraction:
  * The static `GetLink::isSpecialNonHttpLink()` method now needs an instance of `HtmlElement` instead of a Symfony `Crawler`.
  * `GetUrlsFromSitemap::fixUrlSetTag()` now takes an `XmlDocument` instead of a Symfony `Crawler`.
  * The `DomQuery::apply()` method now takes a `Node` instead of a Symfony `Crawler`.

### Deprecated
* `Dom::xPath()` method and
* the `XPathQuery` class as well as
* the new `Node::queryXPath()` method.

### Added
* New step output filter `Filter::arrayHasElement()`. When a step produces array output with a property being a numeric array, you can now filter outputs by checking if one element of that array property matches certain filter criteria. Example: The outputs look like `['foo' => 'bar', 'baz' => ['one', 'two', 'three']]`. You can filter all outputs where `baz` contains `two` like: `Filter::arrayHasElement()->where('baz', Filter::equal('two'))`.

## [2.1.3] - 2024-11-05
### Fixed
* Improvements for deprecations in PHP 8.4.

## [2.1.2] - 2024-10-22
### Fixed
* Issue when converting cookie objects received from the chrome-php library.

## [2.1.1] - 2024-10-21
### Fixed
* Also add cookies, set during headless browser usage, to the cookie jar. When switching back to the (guzzle) HTTP client the cookies should also be sent.
* Don't call `Loader::afterLoad()` when `Loader::beforeLoad()` was not called before. This can potentially happen when an exception is thrown before the call to the `beforeLoad` hook, but it is caught and the `afterLoad` hook method is called anyway. As this most likely won't make sense to users, the `afterLoad` hook callback functions will just not be called in this case.
* The `Throttler` class now has protected methods `_internalTrackStartFor()`, `_requestToUrlWasStarted()` and `_internalTrackEndFor()`. When extending the `Throttler` class (be careful, actually that's not really recommended) they can be used to check if a request to a URL was actually started before.

## [2.1.0] - 2024-10-19
### Added
* The new `postBrowserNavigateHook()` method in the `Http` step classes, which allows to define callback functions that are triggered after the headless browser navigated to the specified URL. They are called with the chrome-php `Page` object as argument, so you can interact with the page. Also, there is a new class `BrowserAction` providing some simple actions (like wait for element, click element,...) as Closures via static methods. You can use it like `Http::get()->postBrowserNavigateHook(BrowserAction::clickElement('#element'))`.

## [2.0.1] - 2024-10-15
### Fixed
* Issue with the `afterLoad` hook of the `HttpLoader`, introduced in v2. Calling the hook was commented out, which slipped through because the test case was faulty.

## [2.0.0] - 2024-10-15
### Changed
* __BREAKING__: Removed methods `BaseStep::addToResult()`, `BaseStep::addLaterToResult()`, `BaseStep::addsToOrCreatesResult()`, `BaseStep::createsResult()`, and `BaseStep::keepInputData()`. These methods were deprecated in v1.8.0 and should be replaced with `Step::keep()`, `Step::keepAs()`, `Step::keepFromInput()`, and `Step::keepInputAs()`.
* __BREAKING__: Added the following keep methods to the `StepInterface`: `StepInterface::keep()`, `StepInterface::keepAs()`, `StepInterface::keepFromInput()`, `StepInterface::keepInputAs()`, as well as `StepInterface::keepsAnything()`, `StepInterface::keepsAnythingFromInputData()` and `StepInterface::keepsAnythingFromOutputData()`. If you have a class that implements this interface without extending `Step` (or `BaseStep`), you will need to implement these methods yourself. However, it is strongly recommended to extend `Step` instead.
* __BREAKING__: With the removal of the `addToResult()` method, the library no longer uses `toArrayForAddToResult()` methods on output objects. Instead, please use `toArrayForResult()`. Consequently, `RespondedRequest::toArrayForAddToResult()` has been renamed to `RespondedRequest::toArrayForResult()`.
* __BREAKING__: Removed the `result` and `addLaterToResult` properties from `Io` objects (`Input` and `Output`). These properties were part of the `addToResult` feature and are now removed. Instead, use the `keep` property where kept data is added.
* __BREAKING__: The signature of the `Crawler::addStep()` method has changed. You can no longer provide a result key as the first parameter. Previously, this key was passed to the `Step::addToResult()` method internally. Now, please handle this call yourself.
* __BREAKING__: The return type of the `Crawler::loader()` method no longer allows `array`. This means it's no longer possible to provide multiple loaders from the crawler. Instead, use the new functionality to directly provide a custom loader to a step described below. As part of this change, the `UnknownLoaderKeyException` was also removed as it is now obsolete. If you have any references to this class, please make sure to remove them.
* __BREAKING__: Refactored the abstract `LoadingStep` class to a trait and removed the `LoadingStepInterface`. Loading steps should now extend the `Step` class and use the trait. As multiple loaders are no longer supported, the `addLoader` method was renamed to `setLoader`. Similarly, the methods `useLoader()` and `usesLoader()` for selecting loaders by key are removed. Now, you can directly provide a different loader to a single step using the trait's new `withLoader()` method (e.g., `Http::get()->withLoader($loader)`). The trait now also uses phpdoc template tags, for a generic loader type. You can define the loader type by putting `/** @use LoadingStep<MyLoader> */` above `use LoadingStep;` in your step class. Then your IDE and static analysis (if supported) will know what type of loader, the trait methods return and accept.
* __BREAKING__: Removed the `PaginatorInterface` to allow for better extensibility. The old `Crwlr\Crawler\Steps\Loading\Http\Paginators\AbstractPaginator` class has also been removed. Please use the newer, improved version `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator`. This newer version has also changed: the first argument `UriInterface $url` is removed from the `processLoaded()` method, as the URL also is part of the request (`Psr\Http\Message\RequestInterface`) which is now the first argument. Additionally, the default implementation of the `getNextRequest()` method is removed. Child implementations must define this method themselves. If your custom paginator still has a `getNextUrl()` method, note that it is no longer needed by the library and will not be called. The `getNextRequest()` method now fulfills its original purpose.
* __BREAKING__: Removed methods from `HttpLoader`:
  * `$loader->setHeadlessBrowserOptions()` => use `$loader->browser()->setOptions()` instead
  * `$loader->addHeadlessBrowserOptions()` => use `$loader->browser()->addOptions()` instead
  * `$loader->setChromeExecutable()` => use `$loader->browser()->setExecutable()` instead
  * `$loader->browserHelper()` => use `$loader->browser()` instead
* __BREAKING__: Removed method `RespondedRequest::cacheKeyFromRequest()`. Use `RequestKey::from()` instead.
* __BREAKING__: The `HttpLoader::retryCachedErrorResponses()` method now returns an instance of the new `Crwlr\Crawler\Loader\Http\Cache\RetryManager` class. This class provides the methods `only()` and `except()` to restrict retries to specific HTTP response status codes. Previously, this method returned the `HttpLoader` itself (`$this`), so if you're using it in a chain and calling other loader methods after it, you will need to refactor your code.
* __BREAKING__: Removed the `Microseconds` class from this package. It has been moved to the `crwlr/utils` package, which you can use instead.

### Added
* New methods `FileCache::prolong()` and `FileCache::prolongAll()` to allow prolonging the time to live for cached responses.

### Fixed
* The `maxOutputs()` method is now also available and working on `Group` steps.
* Improved warning messages for step validations that are happening before running a crawler.
* A `PreRunValidationException` when the crawler finds a problem with the setup, before actually running, is not only logged as an error via the logger, but also rethrown to the user. This way the user won't get the impression, that the crawler ran successfully without looking at the log messages.

## [1.10.0] - 2024-08-05
### Added
* URL refiners: `UrlRefiner::withScheme()`, `UrlRefiner::withHost()`, `UrlRefiner::withPort()`, `UrlRefiner::withoutPort()`, `UrlRefiner::withPath()`, `UrlRefiner::withQuery()`, `UrlRefiner::withoutQuery()`, `UrlRefiner::withFragment()` and `UrlRefiner::withoutFragment()`.
* New paginator stop rules `PaginatorStopRules::contains()` and `PaginatorStopRules::notContains()`.
* Static method `UserAgent::mozilla5CompatibleBrowser()` to get a `UserAgent` instance with the user agent string `Mozilla/5.0 (compatible)` and also the new method `withMozilla5CompatibleUserAgent` in the `AnonymousHttpCrawlerBuilder` that you can use like this: `HttpCrawler::make()->withMozilla5CompatibleUserAgent()`.

## [1.9.5] - 2024-07-25
### Fixed
* Prevent PHP warnings when an HTTP response includes a `Content-Type: application/x-gzip` header, but the content is not actually compressed. This issue also occurred with cached responses, because compressed content is decoded during caching. Upon retrieval from the cache, the header indicated compression, but the content was already decoded.

## [1.9.4] - 2024-07-24
### Fixed
* When using `HttpLoader::cacheOnlyWhereUrl()` to restrict caching, the filter rule is not only applied when adding newly loaded responses to the cache, but also for using cached responses. Example: a response for `https://www.example.com/foo` is already available in the cache, but `$loader->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'))` was called, the cached response is not used.

## [1.9.3] - 2024-07-05
### Fixed
* Add `HttpLoader::browser()` as a replacement for `HttpLoader::browserHelper()` and deprecate the `browserHelper()` method. It's an alias and just because it will read a little better: `$loader->browser()->xyz()` vs. `$loader->browserHelper()->xyz()`. `HttpLoader::browserHelper()` will be removed in v2.0.
* Also deprecate `HttpLoader::setHeadlessBrowserOptions()`, `HttpLoader::addHeadlessBrowserOptions()` and `HttpLoader::setChromeExecutable()`. Use `$loader->browser()->setOptions()`, `$loader->browser()->addOptions()` and `$loader->browser()->setExecutable()` instead.

## [1.9.2] - 2024-06-18
### Fixed
* Issue with setting the headless chrome executable, introduced in 1.9.0. 

## [1.9.1] - 2024-06-17
### Added
* Also add `HeadlessBrowserLoaderHelper::getTimeout()` to get the currently configured timeout value.

## [1.9.0] - 2024-06-17
### Added
* New methods `HeadlessBrowserLoaderHelper::setTimeout()` and `HeadlessBrowserLoaderHelper::waitForNavigationEvent()` to allow defining the timeout for the headless chrome in milliseconds (default 30000 = 30 seconds) and the navigation event (`load` (default), `DOMContentLoaded`, `firstMeaningfulPaint`, `networkIdle`, etc.) to wait for when loading a URL.

## [1.8.0] - 2024-06-05
### Added
* New methods `Step::keep()` and `Step::keepAs()`, as well as `Step::keepFromInput()` and `Step::keepInputAs()`, as alternatives to `Step::addToResult()` (or `Step::addLaterToResult()`). The `keep()` method can be called without any argument, to keep all from the output data. It can be called with a string, to keep a certain key or with an array to keep a list of keys. If the step yields scalar value outputs (not an associative array or object with keys) you need to use the `keepAs()` method with the key you want the output value to have in the kept data. The methods `keepFromInput()` and `keepInputAs()` work the same, but use the input (not the output) that the step receives. Most likely only needed with a first step, to keep data from initial inputs (or in a sub crawler, see below). Kept properties can also be accessed with the `Step::useInputKey()` method, so you can easily reuse properties from multiple steps ago as input.
* New method `Step::outputType()` with default implementation returning `StepOutputType::Mixed`. Please consider implementing this method yourself in all your custom steps, because it is going to be required in v2 of the library. It allows detecting (potential) problems in crawling procedures immediately when starting a run instead of failing after already running a while.
* New method `Step::subCrawlerFor()`, allowing to fill output properties from an actual full child crawling procedure. As the first argument, you give it a key from the step's output, that the child crawler uses as input(s). As the second argument you need to provide a `Closure` that receives a clone of the current `Crawler` without steps and with initial inputs, set from the current output. In the `Closure` you then define the crawling procedure by adding steps as you're used to do it, and return it. This allows to achieve nested output data, scraped from different (sub-)pages, in a more flexible and less complicated way than with the usual linear crawling procedure and `Step::addToResult()`.

### Deprecated
* The `Step::addToResult()`, `Step::addLaterToResult()` and `Step::keepInputData()` methods. Instead, please use the new keep methods. This can cause some migration work for v2, because especially the add to result methods are a pretty central functionality, but the new "keep" methodology (plus the new sub crawler feature) will make a lot of things easier, less complex and the library will most likely work more efficiently in v2.

### Fixed
* When a cache file was generated with compression, and you're trying to read it with a `FileCache` instance without compression enabled, it also works. When unserializing the file content fails it tries decoding the string first before unserializing it.

## [1.7.2] - 2024-03-19
### Fixed
* When the `useInputKey()` method is used on a step and the defined key does not exist in input, it logs a warning and does not invoke the step instead of throwing an `Exception`.

## [1.7.1] - 2024-03-11
### Fixed
* A PHP error that happened when the loader returns `null` for the initial request in the `Http::crawl()` step.

## [1.7.0] - 2024-03-04
### Added
* Allow getting the whole decoded JSON as array with the new `Json::all()` and also allow to get the whole decoded JSON, when using `Json::get()`, inside a mapping using either empty string or `*` as target. Example: `Json::get(['all' => '*'])`. `*` only works, when there is no key `*` in the decoded data.

### Fixed
* Make it work with responses loaded by a headless browser. If decoding the input string fails, it now checks if it could be HTML. If that's the case, it extracts the text content of the `<body>` and tries to decode this instead.

## [1.6.2] - 2024-02-26
### Fixed
* When using `HttpLoader::cacheOnlyWhereUrl()` and a request was redirected (maybe even multiple times), previously all URLs in the chain had to match the filter rule. As this isn't really practicable, now only one of the URLs has to match the rule.

## [1.6.1] - 2024-02-16
### Changed
* Make method `HttpLoader::addToCache()` public, so steps can update a cached response with an extended version.

## [1.6.0] - 2024-02-13
### Added
* Enable dot notation in `Step::addToResult()`, so you can get data from nested output, like: `$step->addToResult(['url' => 'response.url', 'status' => 'response.status', 'foo' => 'bar'])`.
* When a step adds output properties to the result, and the output contains objects, it tries to serialize those objects to arrays, by calling `__serialize()`. If you want an object to be serialized differently for that purpose, you can define a `toArrayForAddToResult()` method in that class. When that method exists, it's preferred to the `__serialize()` method.
* Implemented above-mentioned `toArrayForAddToResult()` method in the `RespondedRequest` class, so on every step that somehow yields a `RespondedRequest` object, you can use the keys `url`, `uri`, `status`, `headers` and `body` with the `addToResult()` method. Previously this only worked for `Http` steps, because it defines output key aliases (`HttpBase::outputKeyAliases()`). Now, in combination with the ability to use dot notation when adding data to the result, if your custom step returns nested output like `['response' => RespondedRequest, 'foo' => 'bar']`, you can add response data to the result like this `$step->addToResult(['url' => 'response.url', 'body' => 'response.body'])`.

### Fixed
* Improvement regarding the timing when a store (`Store` class instance) is called by the crawler with a final crawling result. When a crawling step initiates a crawling result (so, `addToResult()` was called on the step instance), the crawler has to wait for all child outputs (resulting from one step-input) until it calls the store, because the child outputs can all add data to the same final result object. But previously this was not only the case for all child outputs starting from a step where `addToResult()` was called, but all children of one initial crawler input. So with this change, in a lot of cases, the store will earlier be called with finished `Result` objects and memory usage will be lowered.

## [1.5.3] - 2024-02-07
### Fixed
* Merge `HttpBaseLoader` back to `HttpLoader`. It's probably not a good idea to have multiple loaders. At least not multiple loaders just for HTTP. It should be enough to publicly expose the `HeadlessBrowserLoaderHelper` via `HttpLoader::browserHelper()` for the extension steps. But keep the `HttpBase` step, to share the general HTTP functionality implemented there.

## [1.5.2] - 2024-02-07
### Fixed
* Issue in `GetUrlsFromSitemap` (`Sitemap::getUrlsFromSitemap()`) step when XML content has no line breaks.

## [1.5.1] - 2024-02-06
### Fixed
* For being more flexible to build a separate headless browser loader (in an extension package) extract the most basic HTTP loader functionality to a new `HttpBaseLoader` and important functionality for the headless browser loader to a new `HeadlessBrowserLoaderHelper`. Further, also share functionality from the `Http` steps via a new abstract `HttpBase` step. It's considered a fix, because there's no new functionality, just refactoring existing code for better extendability.

## [1.5.0] - 2024-01-29
### Added
* The `DomQuery` class (parent of `CssSelector` (`Dom::cssSelector`) and `XPathQuery` (`Dom::xPath`)) has a new method `formattedText()` that uses the new crwlr/html-2-text package to convert the HTML to formatted plain text. You can also provide a customized instance of the `Html2Text` class to the `formattedText()` method.

### Fixed
* The `Http::crawl()` step won't yield a page again if a newly found URL responds with a redirect to a previously loaded URL.

## [1.4.0] - 2024-01-14
### Added
* The `QueryParamsPaginator` can now also increase and decrease non first level query param values like `foo[bar][baz]=5` using dot notation: `QueryParamsPaginator::paramsInUrl()->increaseUsingDotNotation('foo.bar.baz', 5)`.

## [1.3.5] - 2023-12-20
### Fixed
* The `FileCache` can now also read uncompressed cache files when compression is activated.

## [1.3.4] - 2023-12-19
### Fixed
* Reset paginator state after finishing paginating for one base input, to enable paginating multiple listings of the same structure.

## [1.3.3] - 2023-12-01
### Fixed
* Add forgotten getter method to get the DOM query that is attached to an `InvalidDomQueryException` instance.

## [1.3.2] - 2023-12-01
### Fixed
* When creating a `CssSelector` or `XPathQuery` instance with invalid selector/query syntax, an `InvalidDomQueryException` is now immediately thrown. This change is considered to be not only non-breaking, but actually a fix, because the `CssSelector` would otherwise throw an exception later when the `apply()` method is called. The `XPathQuery` would silently return no result without notifying you of the invalid query and generate a PHP warning.

## [1.3.1] - 2023-11-30
### Fixed
* Support usage with the new Symfony major version v7.

## [1.3.0] - 2023-10-28
### Added
* New methods `HttpLoader::useProxy()` and `HttpLoader::useRotatingProxies([...])` to define proxies that the loader shall use. They can be used with a guzzle HTTP client instance (default) and when the loader uses the headless Chrome browser. Using them when providing some other PSR-18 implementation will throw an exception.
* New `QueryParamsPaginator` to paginate by increasing and/or decreasing one or multiple query params, either in the URL or in the body of requests. Can be created via static method `Crwlr\Crawler\Steps\Loading\Http\Paginator::queryParams()`.
* New method `stopWhen` in the new `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator` class (for more info see the deprecation below). You can pass implementations of the new `StopRule` interface or custom closures to that method and then, every time the Paginator receives a loaded response to process, those stop rules are called with the response. If any of the conditions of the stop rules is met, the Paginator stops paginating. Of course also added a few stop rules to use with that new method: `IsEmptyInHtml`, `IsEmptyInJson`, `IsEmptyInXml` and `IsEmptyResponse`, also available via static methods: `PaginatorStopRules::isEmptyInHtml()`, `PaginatorStopRules::isEmptyInJson()`, `PaginatorStopRules::isEmptyInXml()` and `PaginatorStopRules::isEmptyResponse()`.

### Deprecated
* Deprecated the `Crwlr\Crawler\Steps\Loading\Http\PaginatorInterface` and the `Crwlr\Crawler\Steps\Loading\Http\Paginators\AbstractPaginator`. Instead, added a new version of the `AbstractPaginator` as `Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator` that can be used. Usually there shouldn't be a problem switching from the old to the new version. If you want to make your custom paginator implementation ready for v2 of the library, extend the new `AbstractPaginator` class, implement your own `getNextRequest` method (new requirement, with a default implementation in the abstract class, which will be removed in v2) and check if properties and methods of your existing class don't collide with the new properties and methods in the abstract class.

### Fixed
* The `HttpLoader::load()` implementation won't throw any exception, because it shouldn't kill a crawler run. When you want any loading error to end the whole crawler execution `HttpLoader::loadOrFail()` should be used. Also adapted the phpdoc in the `LoaderInterface`.

## [1.2.2] - 2023-09-19
### Fixed
* Fix in `HttpCrawl` (`Http::crawl()`) step: when a page contains a broken link, that can't be resolved and throws an `Exception` from the URL library, ignore the link and log a warning message.
* Minor fix for merging HTTP headers when an `Http` step gets both, statically defined headers and headers to use from array input.

## [1.2.1] - 2023-08-21
### Fixed
* When a URL redirects, the `trackRequestEndFor()` method of the `HttpLoader`'s `Throttler` instance is called only once at the end and with the original request URL.

## [1.2.0] - 2023-08-18
### Added
* New `onCacheHit` hook in the `Loader` class (in addition to `beforeLoad`, `onSuccess`, `onError` and `afterLoad`) that is called in the `HttpLoader` class when a response for a request was found in the cache.

### Deprecated
* Moved the `Microseconds` value object class to the crwlr/utils package, as it is a very useful and universal tool. The class in this package still exists, but just extends the class from the utils package and will be removed in v2. So, if you're using this class, please change to use the version from the utils package.

## [1.1.6] - 2023-07-20
### Fixed
* Throttling now also works when using the headless browser.

## [1.1.5] - 2023-07-14
### Fixed
* The `Http::crawl()` step, as well as the `Html::getLink()` and `Html::getLinks()` steps now ignore links, when the `href` attribute starts with `mailto:`, `tel:` or `javascript:`. For the crawl step it obviously makes no sense, but it's also considered a bugfix for the getLink(s) steps, because they are meant to deliver absolute HTTP URLs. If you want to get the values of such links, use the HTML data extraction step.

## [1.1.4] - 2023-07-14
### Fixed
* The `Http::crawl()` step now also works with sitemaps as input URL, where the `<urlset>` tag contains attributes that would cause the symfony DomCrawler to not find any elements.

## [1.1.3] - 2023-06-29
### Fixed
* Improved `Json` step: if the target of the "each" (like `Json::each('target', [...])`) does not exist in the input JSON data, the step yields nothing and logs a warning.

## [1.1.2] - 2023-05-28
### Fixed
* Using the `only()` method of the `MetaData` (`Html::metaData()`) step class, the `title` property was always contained in the output, even if not listed in the `only` properties. This is fixed now.

## [1.1.1] - 2023-05-28
### Fixed
* There was an issue when adding multiple associative arrays with the same key to a `Result` object: let's say you're having a step producing array output like: `['bar' => 'something', 'baz' => 'something else']` and it (the whole array) shall be added to the result property `foo`. When the step produced multiple such array outputs, that led to a result like `['bar' => '...', 'baz' => '...', ['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...']]`. Now it's fixed to result in `[['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...'], ['bar' => '...', 'baz' => '...']]`.

## [1.1.0] - 2023-05-21

### Added
* `Http` steps can now receive body and headers from input data (instead of statically defining them via argument like `Http::method(headers: ...)`) using the new methods `useInputKeyAsBody(<key>)` and `useInputKeyAsHeader(<key>, <asHeader>)` or `useInputKeyAsHeaders(<key>)`. Further, when invoked with associative array input data, the step will by default use the value from `url` or `uri` for the request URL. If the input array contains the URL in a key with a different name, you can use the new `useInputKeyAsUrl(<key>)` method. That was basically already possible with the existing `useInputKey(<key>)` method, because the URL is the main input argument for the step. But if you want to use it in combination with the other new `useInputKeyAsXyz()` methods, you have to use `useInputKeyAsUrl()`, because using `useInputKey(<key>)` would invoke the whole step with that key only.
* `Crawler::runAndDump()` as a simple way to just run a crawler and dump all results, each as an array.
* `addToResult()` now also works with serializable objects.
* If you know certain keys that the output of a step will contain, you can now also define aliases for those keys, to be used with `addToResult()`. The output of an `Http` step (`RespondedRequest`) contains the keys `requestUri` and `effectiveUri`. The aliases `url` and `uri` refer to `effectiveUri`, so `addToResult(['url'])` will add the `effectiveUri` as `url` to the result object.
* The `GetLink` (`Html::getLink()`) and `GetLinks` (`Html::getLinks()`) steps, as well as the abstract `DomQuery` (parent of `CssSelector` (/`Dom::cssSelector`) and `XPathQuery` (/`Dom::xPath`)) now have a method `withoutFragment()` to get links respectively URLs without their fragment part.
* The `HttpCrawl` step (`Http::crawl()`) has a new method `useCanonicalLinks()`. If you call it, the step will not yield responses if its canonical link URL was already yielded. And if it discovers a link, and some document pointing to that URL via canonical link was already loaded, it treats it as if it was already loaded. Further this feature also sets the canonical link URL as the `effectiveUri` of the response.
* All filters can now be negated by calling the `negate()` method, so the `evaluate()` method will return the opposite bool value when called. The `negate()` method returns an instance of `NegatedFilter` that wraps the original filter.
* New method `cacheOnlyWhereUrl()` in the `HttpLoader` class, that takes an instance of the `FilterInterface` as argument. If you define one or multiple filters using this method, the loader will cache only responses for URLs that match all the filters.

### Fixed
* The `HttpCrawl` step (`Http::crawl()`) by default now removes the fragment part of URLs to not load the same page multiple times, because in almost any case, servers won't respond with different content based on the fragment. That's why this change is considered non-breaking. For the rare cases when servers respond with different content based on the fragment, you can call the new `keepUrlFragment()` method of the step.
* Although the `HttpCrawl` step (`Http::crawl()`) already respected the limit of outputs defined via the `maxOutputs()` method, it actually didn't stop loading pages. The limit had no effect on loading, only on passing on outputs (responses) to the next step. This is fixed in this version.
* A so-called byte order mark at the beginning of a file (/string) can cause issues. So just remove it, when a step's input string starts with a UTF-8 BOM.
* There seems to be an issue in guzzle when it gets a PSR-7 request object with a header with multiple string values (as array, like: `['accept-encoding' => ['gzip', 'deflate', 'br']]`). When testing it happened that it only sent the last part (in this case `br`). Therefore, the `HttpLoader` now prepares headers before sending (in this case to: `['accept-encoding' => ['gzip, deflate, br']]`).
* You can now also use the output key aliases when filtering step outputs. You can even use keys that are only present in the serialized version of an output object.

## [1.0.2] - 2023-03-20
### Fixed
* JSON step: another fix for JSON strings having keys without quotes with empty string value.

## [1.0.1] - 2023-03-17
### Fixed
* JSON step: improve attempt to fix JSON string having keys without quotes.

## [1.0.0] - 2023-02-08

### Added
* New method `Step::refineOutput()` to manually refine step output values. It takes either a `Closure` or an instance of the new `RefinerInterface` as argument. If the step produces array output, you can provide a key from the array output, to refine, as first argument and the refiner as second argument. You can call the method multiple times and all the refiners will be applied to the outputs in the order you add them. If you want to refine multiple output array keys with a `Closure`, you can skip providing a key and the `Closure` will receive the full output array for refinement. As mentioned you can provide an instance of the `RefinerInterface`. There are already a few implementations: `StringRefiner::afterFirst()`, `StringRefiner::afterLast()`, `StringRefiner::beforeFirst()`, `StringRefiner::beforeLast()`, `StringRefiner::betweenFirst()`, `StringRefiner::betweenLast()` and `StringRefiner::replace()`.
* New method `Step::excludeFromGroupOutput()` to exclude a normal step's output from the combined output of a group that it's part of.
* New method `HttpLoader::setMaxRedirects()` to customize the limit of redirects to follow. Works only when using the HTTP client.
* New filters to filter by string length, with the same options as the comparison filters (equal, not equal, greater than,...).
* New `Filter::custom()` that you can use with a Closure, so you're not limited to the available filters only.
* New method `DomQuery::link()` as a shortcut for `DomQuery::attribute('href')->toAbsoluteUrl()`.
* New static method `HttpCrawler::make()` returning an instance of the new class `AnonymousHttpCrawlerBuilder`. This makes it possible to create your own Crawler instance with a one-liner like: `HttpCrawler::make()->withBotUserAgent('MyCrawler')`. There's also a `withUserAgent()` method to create an instance with a normal (non bot) user agent.

### Changed
* __BREAKING__: The `FileCache` now also respects the `ttl` (time to live) argument and by default it is one hour (3600 seconds). If you're using the cache and expect the items to live (basically) forever, please provide a high enough value for the default time to live. When you try to get a cache item that is already expired, it (the file) is immediately deleted.
* __BREAKING__: The `TooManyRequestsHandler` (and with that also the constructor argument in the `HttpLoader`) was renamed to `RetryErrorResponseHandler`. It now reacts the same to 503 (Service Unavailable) responses as to the 429 (Too Many Requests) responses. If you're actively passing your own instance to the `HttpLoader`, you need to update it.
* You can now have multiple different loaders in a `Crawler`. To use this, return an array containing your loaders from the protected `Crawler::loader()` method with keys to name them. You can then selectively use them by calling the `Step::useLoader()` method on a loading step with the key of the loader it should use.

### Removed
* __BREAKING__: The loop feature. The only real world use case should be paginating listings and this should be solved with the Paginator feature.
* __BREAKING__: `Step::dontCascade()` and `Step::cascades()` because with the change in v0.7, that groups can only produce combined output, there should be no use case for this anymore. If you want to exclude one step's output from the combined group output, you can use the new `Step::excludeFromGroupOutput()` method.

## [0.7.0] - 2023-01-13

### Added
* New functionality to paginate: There is the new `Paginate` child class of the `Http` step class (easy access via `Http::get()->paginate()`). It takes an instance of the `PaginatorInterface` and uses it to iterate through pagination links. There is one implementation of that interface, the `SimpleWebsitePaginator`. The `Http::get()->paginate()` method uses it by default, when called just with a CSS selector to get pagination links. Paginators receive all loaded pages and implement the logic to find pagination links. The paginator class is also called before sending a request, with the request object that is about to be sent as an argument (`prepareRequest()`). This way, it should even be doable to implement more complex pagination functionality. For example when pagination is built using POST request with query strings in the request body.
* New methods `stopOnErrorResponse()` and `yieldErrorResponses()` that can be used with `Http` steps. By calling `stopOnErrorResponse()` the step will throw a `LoadingException` when a response has a 4xx or 5xx status code. By calling `yieldErrorResponses()`, even error responses will be yielded and passed on to the next steps (this was default behaviour until this version. See the breaking change below).
* The body of HTTP responses with a `Content-Type` header containing `application/x-gzip` is automatically decoded when `Http::getBodyString()` is used. Therefore, added `ext-zlib` to suggested in `composer.json`.
* New methods `addToResult()` and `addLaterToResult()`. `addToResult()` is a single replacement for `setResultKey()` and `addKeysToResult()` (they are removed, see `Changed` below) that can be used for array and non array output. `addLaterToResult()` is a new method that does not create a Result object immediately, but instead adds the output of the current step to all the Results that will later be created originating from the current output.
* New methods `outputKey()` and `keepInputData()` that can be used with any step. Using the `outputKey()` method, the step will convert non array output to an array and use the key provided as an argument to this method as array key for the output value. The `keepInputData()` method allows you to forward data from the step's input to the output. If the input is non array you can define a key using the method's argument. This is useful e.g. if you're having data in the initial inputs that you also want to add to the final crawling results.
* New method `createsResult()` that can be used with any step, so you can differentiate if a step creates a Result object, or just keeps data to add to results later (new `addLaterToResult()` method). But primarily relevant for library internal use.
* The `FileCache` class can compress the cache data now to save disk space. Use the `useCompression()` method to do so.
* New method `retryCachedErrorResponses()` in `HttpLoader`. When called, the loader will only use successful responses (status code < 400) from the cache and therefore retry already cached error responses.
* New method `writeOnlyCache()` in `HttpLoader` to only write to, but not read from, the response cache. Can be used to renew cached responses.
* `Filter::urlPathMatches()` to filter URL paths using a regex.
* Option to provide a chrome executable name to the `chrome-php/chrome` library via `HttpLoader::setChromeExecutable()`.

### Changed
* __BREAKING__: Group steps can now only produce combined outputs, as previously done when `combineToSingleOutput()` method was called. The method is removed. 
* __BREAKING__: `setResultKey()` and `addKeysToResult()` are removed. Calls to those methods can both be replaced with calls to the new `addToResult()` method.
* __BREAKING__: `getResultKey()` is also removed with `setResultKey()`. It's removed without replacement, as it doesn't really make sense any longer.
* __BREAKING__: Error responses (4xx as well as 5xx), by default, won't produce any step outputs any longer. If you want to receive error responses, use the new `yieldErrorResponses()` method.
* __BREAKING__: Removed the `httpClient()` method in the `HttpCrawler` class. If you want to provide your own HTTP client, implement a custom `loader` method passing your client to the `HttpLoader` instead.
* __Deprecated__ the loop feature (class `Loop` and `Crawler::loop()` method). Probably the only use case is iterating over paginated list pages, which can be done using the new Paginator functionality. It will be removed in v1.0.
* In case of a 429 (Too Many Requests) response, the `HttpLoader` now automatically waits and retries. By default, it retries twice and waits 10 seconds for the first retry and a minute for the second one. In case the response also contains a `Retry-After` header with a value in seconds, it complies with that. Exception: by default it waits at max `60` seconds (you can set your own limit if you want), if the `Retry-After` value is higher, it will stop crawling. If all the retries also receive a `429` it also throws an Exception.
* Removed logger from `Throttler` as it doesn't log anything.
* Fail silently when `robots.txt` can't be parsed.
* Default timeout configuration for the default guzzle HTTP client: `connect_timeout` is `10` seconds and `timeout` is `60` seconds.
* The `validateAndSanitize...()` methods in the abstract `Step` class, when called with an array with one single element, automatically try to use that array element as input value.
* With the `Html` and `Xml` data extraction steps you can now add layers to the data that is being extracted, by just adding further `Html`/`Xml` data extraction steps as values in the mapping array that you pass as argument to the `extract()` method.
* The base `Http` step can now also be called with an array of URLs as a single input. Crawl and Paginate steps still require a single URL input.

### Fixed
* The `CookieJar` now also works with `localhost` or other hosts without a registered domain name.
* Improve the `Sitemap::getUrlsFromSitemap()` step to also work when the `<urlset>` tag contains attributes that would cause the symfony DomCrawler to not find any elements.
* Fixed possibility of infinite redirects in `HttpLoader` by adding a redirects limit of 10.

## [0.6.0] - 2022-10-03

### Added
* New step `Http::crawl()` (class `HttpCrawl` extending the normal `Http` step class) for conventional crawling. It loads all pages of a website (same host or domain) by following links. There's also a lot of options like depth, filtering by paths, and so on.
* New steps `Sitemap::getSitemapsFromRobotsTxt()` (`GetSitemapsFromRobotsTxt`) and `Sitemap::getUrlsFromSitemap()` (`GetUrlsFromSitemap`) to get sitemap (URLs) from a robots.txt file and to get all the URLs from those sitemaps.
* New step `Html::metaData()` to get data from meta tags (and title tag) in HTML documents.
* New step `Html::schemaOrg()` (`SchemaOrg`) to get schema.org structured data in JSON-LD format from HTML documents.
* The abstract `DomQuery` class (parent of the `CssSelector` and `XPathQuery` classes) now has some methods to narrow the selected matches further: `first()`, `last()`, `nth(n)`, `even()`, `odd()`.

### Changed
* __BREAKING__: Removed `PoliteHttpLoader` and traits `WaitPolitely` and `CheckRobotsTxt`. Converted the traits to classes `Throttler` and `RobotsTxtHandler` which are dependencies of the `HttpLoader`. The `HttpLoader` internally gets default instances of those classes. The `RobotsTxtHandler` will respect robots.txt rules by default if you use a `BotUserAgent` and it won't if you use a normal `UserAgent`. You can access the loader's `RobotsTxtHandler` via `HttpLoader::robotsTxt()`. You can pass your own instance of the `Throttler` to the loader and also access it via `HttpLoader::throttle()` to change settings.

### Fixed
* Getting absolute links via the `GetLink` and `GetLinks` steps and the `toAbsoluteUrl()` method of the `CssSelector` and `XPathQuery` classes, now also look for `<base>` tags in HTML when resolving the URLs.
* The `SimpleCsvFileStore` can now also save results with nested data (but only second level). It just concatenates the values separated with a ` | `.

## [0.5.0] - 2022-09-03
### Added
* You can now call the new `useHeadlessBrowser` method on the `HttpLoader` class to use a headless Chrome browser to load pages. This is enough to get HTML after executing javascript in the browser. For more sophisticated tasks a separate Loader and/or Steps should better be created.
* With the `maxOutputs()` method of the abstract `Step` class you can now limit how many outputs a certain step should yield at max. That's for example helpful during development, when you want to run the crawler only with a small subset of the data/requests it will actually have to process when you eventually remove the limits. When a step has reached its limit, it won't even call the `invoke()` method any longer until the step is reset after a run.
* With the new `outputHook()` method of the abstract `Crawler` class you can set a closure that'll receive all the outputs from all the steps. Should be only for debugging reasons.
* The `extract()` method of the `Html` and `Xml` (children of `Dom`) steps now also works with a single selector instead of an array with a mapping. Sometimes you'll want to just get a simple string output e.g. for a next step, instead of an array with mapped extracted data.
* In addition to `uniqueOutputs()` there is now also `uniqueInputs()`. It works exactly the same as `uniqueOutputs()`, filtering duplicate input values instead. Optionally also by a key when expected input is an array or an object.
* In order to be able to also get absolute links when using the `extract()` method of Dom steps, the abstract `DomQuery` class now has a method `toAbsoluteUrl()`. The Dom step will automatically provide the `DomQuery` instance with the base url, presumed that the input was an instance of the `RespondedRequest` class and resolve the selected value against that base url.

### Changed
* Remove some not so important log messages.
* Improve behavior of group step's `combineToSingleOutput()`. When steps yield multiple outputs, don't combine all yielded outputs to one. Instead, combine the first output from the first step with the first output from the second step, and so on.
* When results are not explicitly composed, but the outputs of the last step are arrays with string keys, it sets those keys on the Result object instead of setting a key `unnamed` with the whole array as value.

### Fixed
* The static methods `Html::getLink()` and `Html::getLinks()` now also work without argument, like the `GetLink` and `GetLinks` classes.
* When a `DomQuery` (CSS selector or XPath query) doesn't match anything, its `apply()` method now returns `null` (instead of an empty string). When the `Html(/Xml)::extract()` method is used with a single, not matching selector/query, nothing is yielded. When it's used with an array with a mapping, it yields an array with null values. If the selector for one of the methods `Html(/Xml)::each()`, `Html(/Xml)::first()` or `Html(/Xml)::last()` doesn't match anything, that's not causing an error any longer, it just won't yield anything.
* Removed the (unnecessary) second argument from the `Loop::withInput()` method because when `keepLoopingWithoutOutput()` is called and `withInput()` is called after that call, it resets the behavior.
* Issue when date format for expires date in cookie doesn't have dashes in `d-M-Y` (so `d M Y`).

## [0.4.1] - 2022-05-10
### Fixed
* The `Json` step now also works with Http responses as input.

## [0.4.0] - 2022-05-06
### Added
* The `BaseStep` class now has `where()` and `orWhere()` methods to filter step outputs. You can set multiple filters that will be applied to all outputs. When setting a filter using `orWhere` it's linked to the previously added Filter with "OR". Outputs not matching one of the filters, are not yielded. The available filters can be accessed through static methods on the new `Filter` class. Currently available filters are comparison filters (equal, greater/less than,...), a few string filters (contains, starts/ends with) and url filters (scheme, domain, host,...).
* The `GetLink` and `GetLinks` steps now have methods `onSameDomain()`, `notOnSameDomain()`, `onDomain()`, `onSameHost()`, `notOnSameHost()`, `onHost()` to restrict which links to find.
* Automatically add the crawler's logger to the `Store` so you can also log messages from there. This can be breaking as the `StoreInterface` now also requires the `addLogger` method. The new abstract `Store` class already implements it, so you can just extend it.

### Changed
* The `Csv` step can now also be used without defining a column mapping. In that case it will use the values from the first line (so this makes sense when there are column headlines) as output array keys.

## [0.3.0] - 2022-04-27
### Added
* By calling `monitorMemoryUsage()` you can tell the Crawler to add log messages with the current memory usage after every step invocation. You can also set a limit in bytes when to start monitoring and below the limit it won't log memory usage.

### Fixed
* Previously the __use of Generators__ actually didn't make a lot of sense, because the outputs of one step were only iterated and passed on to the next step, after the current step was invoked with all its inputs. That makes steps with a lot of inputs bottlenecks and causes bigger memory consumption. So, changed the crawler to immediately pass on outputs of one step to the next step if there is one.

## [0.2.0] - 2022-04-25
### Added
* `uniqueOutputs()` method to Steps to get only unique output values. If outputs are array or object, you can provide a key that will be used as identifier to check for uniqueness. Otherwise, the arrays or objects will be serialized for comparison which will probably be slower.
* `runAndTraverse()` method to Crawler, so you don't need to manually traverse the Generator, if you don't need the results where you're calling the crawler.
* Implement the behaviour for when a `Group` step should add something to the Result using `setResultKey()` or `addKeysToResult()`, which was still missing. For groups this will only work when using `combineToSingleOutput`.


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to this Package

That you're reading this must mean you consider contributing to
this package. So first off: Awesome! 👍🤘

## Bugs

In case you encounter any bugs please
[file an issue](https://github.com/crwlrsoft/crawler/issues/new).
Describe the issue as well as you can and provide an example to
reproduce it.  
Maybe you're not 100 percent sure whether what you've discovered
is a bug or the intended behavior. You can still file an issue
and tell us which results you'd expect.

If you know how to fix the issue you're welcome to send a pull
request. 💪

## New Features

If you have ideas for new features you can tell us about it on
[Twitter](https://twitter.com/crwlrsoft) or via
[crwlr.software](https://www.crwlr.software/contact) or just
send a pull request. Please keep in mind that there is no
guarantee that your feature will be merged.

## Conventions

### Coding Style

This package follows the
[PSR-12](https://www.php-fig.org/psr/psr-12/) coding standard.
You can run PHP CS Fixer via `composer cs` for a dry run or
`composer cs-fix` to automatically fix code style issues.

### Code quality tools

When you're making changes to this package please always run
tests and linting. Commands:  
`composer test`  
`composer test-integration`  
`composer cs`  
`composer stan`

Ideally you add the pre-commit git hook that is shipped with
this repo that will run tests and linting. Add it to your local
clone by running:  
`composer add-git-hooks`

The integration tests start a simple PHP web server for the
testing purpose on port 8000. If you have anything else running
on that port, the integration tests won't work.

Also, please don't forget to add new test cases if necessary.

### Documentation

For any code change that changes/adds something for users of
the package, please don't forget to add an entry to the
`CHANGELOG.md` file.

## Appreciation

When your pull request is merged I will show some love and tweet
about it. Also, if you meet me in person I will be glad to buy you
a beer.


================================================
FILE: LICENSE
================================================
Copyright (c) 2026 Christian Olear

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject
to the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


================================================
FILE: README.md
================================================
<p align="center"><a href="https://www.crwlr.software" target="_blank"><img src="https://github.com/crwlrsoft/graphics/blob/eee6cf48ee491b538d11b9acd7ee71fbcdbe3a09/crwlr-logo.png" alt="crwlr.software logo" width="260"></a></p>

# Library for Rapid (Web) Crawler and Scraper Development

This library provides kind of a framework and a lot of ready to use, so-called __steps__, that you can use as building blocks, to build your own crawlers and scrapers with.

To give you an overview, here's a list of things that it helps you with:
* [Crawler __Politeness__](https://www.crwlr.software/packages/crawler/the-crawler/politeness) &#128519; (respecting robots.txt, throttling,...)
* Load URLs using
    * [a __(PSR-18) HTTP client__](https://www.crwlr.software/packages/crawler/the-crawler/loaders) (default is of course Guzzle)
    * or a [__headless browser__](https://www.crwlr.software/packages/crawler/the-crawler/loaders#using-a-headless-browser) (chrome) to get source after Javascript execution
* [Get __absolute links__ from HTML documents](https://www.crwlr.software/packages/crawler/included-steps/html#html-get-link) &#x1F517;
* [Get __sitemaps__ from robots.txt and get all URLs from those sitemaps](https://www.crwlr.software/packages/crawler/included-steps/sitemap)
* [__Crawl__ (load) all pages of a website](https://www.crwlr.software/packages/crawler/included-steps/http#crawling) &#x1F577;
* [Use __cookies__ (or don't)](https://www.crwlr.software/packages/crawler/the-crawler/loaders#http-loader) &#x1F36A;
* [Use any __HTTP methods__ (GET, POST,...) and send any headers or body](https://www.crwlr.software/packages/crawler/included-steps/http#http-requests)
* [Easily iterate over __paginated__ list pages](https://www.crwlr.software/packages/crawler/included-steps/http#paginating) &#x1F501;
* Extract data from:
    * [__HTML__](https://www.crwlr.software/packages/crawler/included-steps/html#extracting-data) and also [__XML__](https://www.crwlr.software/packages/crawler/included-steps/xml) (using CSS selectors or XPath queries)
    * [__JSON__](https://www.crwlr.software/packages/crawler/included-steps/json) (using dot notation)
    * [__CSV__](https://www.crwlr.software/packages/crawler/included-steps/csv) (map columns)
* [Extract __schema.org__ structured data](https://www.crwlr.software/packages/crawler/included-steps/html#schema-org) in __JSON-LD__ format from HTML documents
* [Keep memory usage low](https://www.crwlr.software/packages/crawler/crawling-procedure#memory-usage) by using PHP __Generators__ &#x1F4AA;
* [__Cache__ HTTP responses](https://www.crwlr.software/packages/crawler/response-cache) during development, so you don't have to load pages again and again after every code change
* [Get __logs__](https://www.crwlr.software/packages/crawler/the-crawler#loggers) about what your crawler is doing (accepts any PSR-3 LoggerInterface)
* And a lot more...

## Documentation

You can find the documentation at [crwlr.software](https://www.crwlr.software/packages/crawler/getting-started).

## Contributing

If you consider contributing something to this package, read the [contribution guide (CONTRIBUTING.md)](CONTRIBUTING.md).


================================================
FILE: bin/add-git-hooks
================================================
#!/usr/bin/env php
<?php

// Installs the pre-commit hook shipped with this repository into the local clone's
// .git/hooks directory.
$src = __DIR__ . '/../git-hooks/pre-commit';
$dest = __DIR__ . '/../.git/hooks/pre-commit';

// Stop with a non-zero exit code when copying fails (e.g. there is no .git/hooks directory),
// so `composer add-git-hooks` doesn't report success without a hook being installed.
if (!copy($src, $dest)) {
    fwrite(STDERR, 'Failed to copy ' . $src . ' to ' . $dest . PHP_EOL);

    exit(1);
}

// Git only runs hook files that are executable.
if (!chmod($dest, 0755)) {
    fwrite(STDERR, 'Failed to make ' . $dest . ' executable' . PHP_EOL);

    exit(1);
}


================================================
FILE: composer.json
================================================
{
    "name": "crwlr/crawler",
    "description": "Web crawling and scraping library.",
    "type": "library",
    "keywords": [
        "crwlr",
        "crawl",
        "crawler",
        "crawling",
        "scrape",
        "scraping",
        "scraper",
        "web",
        "bot"
    ],
    "homepage": "https://www.crwlr.software/packages/crawler",
    "license": "MIT",
    "authors": [
        {
            "name": "Christian Olear",
            "homepage": "https://www.otsch.codes",
            "role": "Developer"
        }
    ],
    "support": {
        "issues": "https://github.com/crwlrsoft/crawler/issues",
        "source": "https://github.com/crwlrsoft/crawler",
        "docs": "https://www.crwlr.software/packages/crawler"
    },
    "require": {
        "ext-dom": "*",
        "php": "^8.1",
        "crwlr/robots-txt": "^1.1",
        "crwlr/schema-org": "^0.2|^0.3",
        "crwlr/url": "^2.1",
        "psr/log": "^2.0|^3.0",
        "symfony/dom-crawler": "^6.0|^7.0",
        "symfony/css-selector": "^6.0|^7.0",
        "psr/simple-cache": "^1.0|^2.0|^3.0",
        "guzzlehttp/guzzle": "^7.4",
        "adbario/php-dot-notation": "^3.1",
        "chrome-php/chrome": "^1.7",
        "crwlr/utils": "^1.2",
        "crwlr/html-2-text": "^0.1.0"
    },
    "require-dev": {
        "pestphp/pest": "^2.3|^3.0|^4.0",
        "mockery/mockery": "^1.5",
        "phpstan/phpstan": "^1.4|^2.0",
        "phpstan/phpstan-mockery": "^1.0|^2.0",
        "phpstan/extension-installer": "^1.1",
        "phpstan/phpstan-phpunit": "^1.0|^2.0",
        "friendsofphp/php-cs-fixer": "^3.57",
        "spatie/invade": "^2.0",
        "symfony/process": "^6.0|^7.0"
    },
    "suggest": {
        "ext-zlib": "Needed to uncompress compressed responses",
        "voku/portable-ascii": "^2.0"
    },
    "funding": [
        {
            "type": "github",
            "url": "https://github.com/sponsors/otsch"
        }
    ],
    "autoload": {
        "psr-4": {
            "Crwlr\\Crawler\\": "src/"
        }
    },
    "autoload-dev": {
        "psr-4": {
            "tests\\": "tests/"
        }
    },
    "scripts": {
        "test": "./vendor/bin/pest --exclude-group=integration --exclude-group=php84 --display-warnings --bail",
        "test-php84": "./vendor/bin/pest --group=php84 --display-warnings --bail",
        "test-integration": "./vendor/bin/pest --group=integration --display-warnings --bail",
        "stan": "@php -d memory_limit=4G vendor/bin/phpstan analyse",
        "cs": "php-cs-fixer fix -v --dry-run",
        "cs-fix": "php-cs-fixer fix -v",
        "add-git-hooks": "@php bin/add-git-hooks"
    },
    "config": {
        "allow-plugins": {
            "pestphp/pest-plugin": true,
            "phpstan/extension-installer": true
        }
    }
}


================================================
FILE: git-hooks/pre-commit
================================================
#!/usr/bin/env php
<?php

// The checks to run before a commit: composer command => label shown in the output.
$checks = [
    'composer test' => 'Unit tests',
    'composer test-integration' => 'Integration tests',
    'composer cs-fix' => 'PHP Coding Standards Fixer',
    'composer stan' => 'PHPStan',
];

foreach ($checks as $command => $label) {
    run($command, $label);
}

// A failing check ends the process with exit code 1 (see handleFail()), so getting here
// means that all checks passed and the commit may proceed.
exit(0);

/**
 * Runs a shell command as one of the pre-commit checks.
 *
 * Announces the check, executes the command, aborts the commit when the command fails
 * and otherwise prints a one-line summary of its output.
 *
 * @param string $command the shell command to execute
 * @param string|null $descriptiveName label to print instead of the raw command
 */
function run(string $command, ?string $descriptiveName = null)
{
    $label = $descriptiveName ?? $command;

    printLine(blue('RUN ' . $label . '...'));

    exec($command, $lines, $exitCode);

    handleFail($lines, $exitCode);

    showSummary($lines);
}

/**
 * Aborts the commit when a check failed.
 *
 * When the return code is non-zero, prints the command's output and ends the process
 * with exit code 1. Does nothing when the return code is 0.
 *
 * @param string[] $output the output lines of the executed command
 * @param int $returnCode the exit code of the executed command
 */
function handleFail($output, $returnCode)
{
    if ($returnCode === 0) {
        return;
    }

    printLine(red('Failed:'));

    printLines($output);

    printLine(red('Aborting commit...'));

    exit(1);
}

/**
 * Prints a "Summary:" headline followed by the last non-empty line of a command's output,
 * surrounded by blank lines.
 *
 * @param string[] $output the output lines of the executed command
 */
function showSummary(array $output)
{
    printLine('');

    printLine(green('Summary:'));

    outputLastNotEmptyLine($output);

    printLine('');
}

/**
 * Prints the last line of the output that contains more than just whitespace.
 * Prints nothing when there is no such line.
 *
 * @param string[] $output the output lines of the executed command
 */
function outputLastNotEmptyLine(array $output)
{
    // Walk the lines from the end, so the first non-blank one found is the last one of the output.
    foreach (array_reverse($output) as $line) {
        if (trim($line) === '') {
            continue;
        }

        printLine($line);

        return;
    }
}

/**
 * Writes the string to the standard output, followed by a line break.
 */
function printLine(string $string)
{
    echo $string, PHP_EOL;
}

/**
 * Writes all the lines to the standard output, separated by line breaks and followed by
 * a final line break. An empty array results in a single line break.
 *
 * @param string[] $lines
 */
function printLines(array $lines)
{
    $text = implode(PHP_EOL, $lines);

    echo $text, PHP_EOL;
}

/**
 * Writes an empty line to the standard output.
 */
function printBlankLine()
{
    echo PHP_EOL;
}

/**
 * Wraps the string in the ANSI escape sequences for red terminal text.
 */
function red(string $string): string
{
    $ansiCode = '0;31';

    return color($ansiCode, $string);
}

/**
 * Wraps the string in the ANSI escape sequences for green terminal text.
 */
function green(string $string): string
{
    $ansiCode = '0;32';

    return color($ansiCode, $string);
}

/**
 * Wraps the string in the ANSI escape sequences for blue terminal text.
 */
function blue(string $string): string
{
    $ansiCode = '0;34';

    return color($ansiCode, $string);
}

/**
 * Wraps the string in an ANSI escape sequence that switches to the given color code
 * and a trailing sequence that resets the terminal formatting.
 *
 * @param string $colorCode the parameters of the escape sequence, e.g. `0;31`
 * @param string $string the text to colorize
 */
function color(string $colorCode, string $string): string
{
    $start = "\e[" . $colorCode . "m";

    $reset = "\e[0m";

    return $start . $string . $reset;
}


================================================
FILE: phpstan.neon
================================================
parameters:
    level: 8
    paths:
        - src
        - tests
    excludePaths:
        analyse:
            - tests/_Integration/_Server
    reportUnmatchedIgnoredErrors: false
    ignoreErrors:
        - "#^Call to an undefined method Pest\\\\PendingCalls\\\\TestCall\\|Pest\\\\Support\\\\HigherOrderTapProxy\\:\\:(with|throws)\\(\\).$#"
        - "#^Access to an undefined property Spatie\\\\Invade\\\\Invader#"
        - "#^Call to an undefined method Spatie\\\\Invade\\\\Invader#"
        - "#^Call to protected method [a-zA-Z]{5,30}\\(\\) of class PHPUnit\\\\Framework\\\\TestCase.#"
        - "#^(?:Parameter|Method) .+ has invalid (return )?type Dom\\\\.+\\.#"
        - "#^Call to .+ on an unknown class Dom\\\\.+\\.#"
        - "#^Property .+ has unknown class Dom\\\\.+ as its type\\.#"
        - "#^Class Dom\\\\.+ not found.#"
        - "#^Access to property .+ on an unknown class Dom\\\\.+\\.#"
        - "#^PHPDoc tag .+ contains unknown class Dom\\\\.+\\.#"
        - "#^Call to an undefined (static )?method Dom\\\\.+::.+\\(\\)\\.#"
        - "#^Access to an undefined property Dom\\\\.+::\\$.+\\.#"
        - "#^Function .+ has invalid return type Dom\\\\.+\\.#"
        - "#^(?:Used )?(?:C|c)onstant DOM\\\\.+ not found\\.#"
        - "#^Instantiated class Dom\\\\.+ not found.#"


================================================
FILE: phpunit.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/10.1/phpunit.xsd" bootstrap="vendor/autoload.php" colors="true" cacheDirectory=".phpunit.cache">
  <testsuites>
    <testsuite name="Test Suite">
      <directory suffix="Test.php">./tests</directory>
    </testsuite>
  </testsuites>
  <coverage/>
  <source>
    <include>
      <directory suffix=".php">./app</directory>
      <directory suffix=".php">./src</directory>
    </include>
  </source>
</phpunit>


================================================
FILE: src/Cache/CacheItem.php
================================================
<?php

namespace Crwlr\Crawler\Cache;

use DateInterval;
use DateTimeImmutable;
use Exception;

/**
 * A cached value together with its key, its time to live and the time it was created.
 */
class CacheItem
{
    protected string $key;

    /**
     * @param mixed $value the value to cache
     * @param string|null $key the cache key; when null or an empty string, the key is taken from the
     *                         value's `cacheKey()` method if it has one, otherwise it's the md5 hash
     *                         of the serialized value
     * @param int|DateInterval $ttl time to live, in seconds or as an interval
     * @param DateTimeImmutable $createdAt the point in time from which the time to live is counted
     */
    public function __construct(
        protected mixed $value,
        ?string $key = null,
        public readonly int|DateInterval $ttl = 3600,
        public readonly DateTimeImmutable $createdAt = new DateTimeImmutable(),
    ) {
        // Compare explicitly instead of relying on truthiness: the string '0' is falsy in PHP,
        // but a perfectly valid key that must not be replaced with a generated one.
        if ($key !== null && $key !== '') {
            $this->key = $key;
        } elseif (is_object($this->value) && method_exists($this->value, 'cacheKey')) {
            $this->key = $this->value->cacheKey();
        } else {
            $this->key = md5(serialize($this->value));
        }
    }

    public function key(): string
    {
        return $this->key;
    }

    public function value(): mixed
    {
        return $this->value;
    }

    /**
     * Tells if the time to live, counted from the creation time, has already passed.
     *
     * @throws Exception when an integer ttl can't be converted to a DateInterval
     */
    public function isExpired(): bool
    {
        $ttl = $this->ttl instanceof DateInterval ? $this->ttl : new DateInterval('PT' . $this->ttl . 'S');

        return time() > $this->createdAt->add($ttl)->getTimestamp();
    }

    /**
     * Get a new instance with same data but a different time to live.
     */
    public function withTtl(DateInterval|int $ttl): CacheItem
    {
        return new CacheItem($this->value, $this->key, $ttl, $this->createdAt);
    }

    /**
     * @return mixed[]
     */
    public function __serialize(): array
    {
        return [
            'value' => $this->value,
            'key' => $this->key,
            'ttl' => $this->ttl,
            'createdAt' => $this->createdAt,
        ];
    }

    /**
     * Restores the state written by __serialize(). The constructor is not run when unserializing,
     * so the readonly properties are still uninitialized and can be assigned here.
     *
     * @param mixed[] $data
     */
    public function __unserialize(array $data): void
    {
        $this->value = $data['value'];

        $this->key = $data['key'];

        $this->ttl = $data['ttl'];

        $this->createdAt = $data['createdAt'];
    }
}


================================================
FILE: src/Cache/Exceptions/MissingZlibExtensionException.php
================================================
<?php

namespace Crwlr\Crawler\Cache\Exceptions;

use Exception;
use Psr\SimpleCache\CacheException;

/**
 * Cache exception (implements the PSR-16 `CacheException` interface) that the `FileCache`
 * methods declare in their `@throws` tags.
 *
 * NOTE(review): going by the name, it's presumably thrown when compressed cache data should be
 * written or read, but PHP's zlib extension isn't installed. Confirm where it is thrown.
 */
class MissingZlibExtensionException extends Exception implements CacheException {}


================================================
FILE: src/Cache/Exceptions/ReadingCacheFailedException.php
================================================
<?php

namespace Crwlr\Crawler\Cache\Exceptions;

use Exception;
use Psr\SimpleCache\CacheException;

/**
 * Cache exception (implements the PSR-16 `CacheException` interface) that the reading
 * `FileCache` methods (`has()`, `get()`) declare in their `@throws` tags.
 *
 * NOTE(review): going by the name, it's presumably thrown when a cache file can't be read.
 * Confirm where it is thrown.
 */
class ReadingCacheFailedException extends Exception implements CacheException {}


================================================
FILE: src/Cache/FileCache.php
================================================
<?php

namespace Crwlr\Crawler\Cache;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Cache\Exceptions\ReadingCacheFailedException;
use Crwlr\Crawler\Utils\Gzip;
use DateInterval;
use Exception;
use Psr\SimpleCache\CacheInterface;
use Psr\SimpleCache\InvalidArgumentException;
use Throwable;

/**
 * PSR-16 cache implementation that keeps every entry as a serialized CacheItem in its own file.
 *
 * The cache key is used as the file name inside the base path given to the constructor. Optionally, the
 * file contents are compressed via the Gzip utility (see useCompression()).
 */
class FileCache implements CacheInterface
{
    /**
     * Default time to live for items that are stored without an explicit TTL.
     */
    protected DateInterval|int $ttl = 3600;

    protected bool $useCompression = false;

    /**
     * @param string $basePath directory that contains the cache files
     */
    public function __construct(
        protected readonly string $basePath,
    ) {}

    /**
     * Compress cache files when writing and expect compressed files when reading.
     */
    public function useCompression(): static
    {
        $this->useCompression = true;

        return $this;
    }

    /**
     * Set the default time to live for items stored without an explicit TTL.
     */
    public function ttl(DateInterval|int $ttl): static
    {
        $this->ttl = $ttl;

        return $this;
    }

    /**
     * Check if a non-expired item exists for the key. An expired item is deleted on the way.
     *
     * @throws MissingZlibExtensionException|ReadingCacheFailedException|Exception|InvalidArgumentException
     */
    public function has(string $key): bool
    {
        if (file_exists($this->basePath . '/' . $key)) {
            $cacheItem = $this->getCacheItem($key);

            if (!$cacheItem->isExpired()) {
                return true;
            }

            $this->delete($key);
        }

        return false;
    }

    /**
     * Get the cached value for the key, or $default when there is no (non-expired) item.
     * An expired item is deleted on the way.
     *
     * @throws ReadingCacheFailedException|MissingZlibExtensionException|Exception|InvalidArgumentException
     */
    public function get(string $key, mixed $default = null): mixed
    {
        if (file_exists($this->basePath . '/' . $key)) {
            $cacheItem = $this->getCacheItem($key);

            if (!$cacheItem->isExpired()) {
                return $cacheItem->value();
            }

            $this->delete($key);
        }

        return $default;
    }

    /**
     * Store a value under the given key.
     *
     * Plain values are wrapped in a new CacheItem using $ttl or the cache's default TTL. A CacheItem with a
     * different key is re-created with the requested key (using $ttl or the item's own TTL). A CacheItem
     * that already has the requested key is stored as it is; $ttl is not applied in that case.
     *
     * @throws MissingZlibExtensionException
     */
    public function set(string $key, mixed $value, DateInterval|int|null $ttl = null): bool
    {
        if (!$value instanceof CacheItem) {
            $value = new CacheItem($value, $key, $ttl ?? $this->ttl);
        } elseif ($value->key() !== $key) {
            $value = new CacheItem($value->value(), $key, $ttl ?? $value->ttl);
        }

        return $this->saveCacheItem($value);
    }

    /**
     * Remove the cache file for the key.
     *
     * @return bool true when the file was removed or did not exist in the first place, false when removing failed
     */
    public function delete(string $key): bool
    {
        $path = $this->basePath . '/' . $key;

        // A key that is already gone is not an error. Calling unlink() for it would only raise a warning
        // and report a failure, which also made deleteMultiple() stop at the first unknown key.
        if (!file_exists($path)) {
            return true;
        }

        return unlink($path);
    }

    /**
     * Re-save the item stored under the key with a new time to live.
     *
     * @return bool false when the item can't be read or written
     */
    public function prolong(string $key, DateInterval|int $ttl): bool
    {
        try {
            $item = $this->getCacheItem($key);

            return $this->saveCacheItem($item->withTtl($ttl));
        } catch (Throwable) {
            // Deliberately best-effort: any failure is reported via the return value.
            return false;
        }
    }

    /**
     * Delete all files in the base path, except for a .gitkeep file.
     *
     * @return bool false as soon as deleting one of the files fails
     * @throws InvalidArgumentException
     */
    public function clear(): bool
    {
        $allFiles = scandir($this->basePath);

        if (is_array($allFiles)) {
            foreach ($allFiles as $file) {
                if ($file !== '.' && $file !== '..' && $file !== '.gitkeep' && !$this->delete($file)) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Prolong all items in the base path (see prolong()), except for a .gitkeep file.
     *
     * @return bool false as soon as prolonging one of the items fails
     */
    public function prolongAll(DateInterval|int $ttl): bool
    {
        $allFiles = scandir($this->basePath);

        if (is_array($allFiles)) {
            foreach ($allFiles as $file) {
                if ($file !== '.' && $file !== '..' && $file !== '.gitkeep' && !$this->prolong($file, $ttl)) {
                    return false;
                }
            }
        }

        return true;
    }

    /**
     * Get the values for multiple keys, indexed by key. Missing or expired items yield $default.
     *
     * @return iterable<mixed>
     * @throws MissingZlibExtensionException|ReadingCacheFailedException|InvalidArgumentException
     */
    public function getMultiple(iterable $keys, mixed $default = null): iterable
    {
        $items = [];

        foreach ($keys as $key) {
            $items[$key] = $this->get($key, $default);
        }

        return $items;
    }

    /**
     * Store multiple key => value pairs. Stops and returns false at the first pair that can't be stored.
     *
     * @param iterable<mixed> $values
     * @throws MissingZlibExtensionException
     */
    public function setMultiple(iterable $values, DateInterval|int|null $ttl = null): bool
    {
        foreach ($values as $key => $value) {
            if (!$this->set($key, $value, $ttl)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Delete multiple keys. Stops and returns false at the first key that can't be deleted.
     */
    public function deleteMultiple(iterable $keys): bool
    {
        foreach ($keys as $key) {
            if (!$this->delete($key)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Read the file for the key and turn its contents into a CacheItem.
     *
     * File contents that don't unserialize to a CacheItem are wrapped in a new CacheItem for the key.
     *
     * @throws MissingZlibExtensionException
     * @throws ReadingCacheFailedException
     */
    protected function getCacheItem(string $key): CacheItem
    {
        $fileContent = $this->getFileContents($key);

        if ($this->useCompression) {
            $fileContent = $this->decode($fileContent);
        }

        $unserialized = $this->unserialize($fileContent);

        if (!$unserialized instanceof CacheItem) {
            $unserialized = new CacheItem($unserialized, $key);
        }

        return $unserialized;
    }

    /**
     * Serialize (and optionally compress) the item and write it to the file named after the item's key.
     *
     * @throws MissingZlibExtensionException
     */
    protected function saveCacheItem(CacheItem $item): bool
    {
        $content = serialize($item);

        if ($this->useCompression) {
            $content = $this->encode($content);
        }

        return file_put_contents($this->basePath . '/' . $item->key(), $content) !== false;
    }

    /**
     * Unserialize file contents. When that fails, the contents are assumed to be compressed and
     * unserializing is retried after decoding them.
     *
     * @return mixed the unserialized data, or false when the contents can't be unserialized either way
     */
    protected function unserialize(string $content): mixed
    {
        // Temporarily set a new error handler, so unserializing a compressed string does not result in a PHP
        // warning. The failure is reported as E_WARNING since PHP 8.3 and as E_NOTICE in earlier versions.
        set_error_handler(function ($errno, $errstr) {
            return in_array($errno, [E_WARNING, E_NOTICE], true) &&
                str_starts_with($errstr, 'unserialize(): Error at offset 0 of ');
        });

        try {
            $unserialized = unserialize($content);

            // A serialized boolean false ("b:0;") also yields false. That is a valid result, not a failure,
            // so there is no reason to try decompressing it.
            if ($unserialized === false && $content !== 'b:0;') {
                try {
                    $unserialized = unserialize($this->decode($content));
                } catch (Throwable) {
                    // Not compressed either (or zlib is missing): keep the result of the first attempt.
                }
            }
        } finally {
            // Always hand back control to the previous error handler, even if unserializing throws.
            restore_error_handler();
        }

        return $unserialized;
    }

    /**
     * Read the raw contents of the cache file for the key.
     *
     * @throws ReadingCacheFailedException
     */
    protected function getFileContents(string $key): string
    {
        $fileContent = file_get_contents($this->basePath . '/' . $key);

        if ($fileContent === false) {
            throw new ReadingCacheFailedException('Failed to read cache file.');
        }

        return $fileContent;
    }

    /**
     * Compress serialized cache data.
     *
     * @throws MissingZlibExtensionException
     */
    protected function encode(string $content): string
    {
        try {
            return Gzip::encode($content, true);
        } catch (MissingZlibExtensionException $exception) {
            throw new MissingZlibExtensionException(
                'Can\'t compress response cache data. Compression needs PHP ext-zlib installed.',
                previous: $exception,
            );
        }
    }

    /**
     * Decompress cache data that was written with compression.
     *
     * @throws MissingZlibExtensionException
     */
    protected function decode(string $content): string
    {
        try {
            return Gzip::decode($content, true);
        } catch (MissingZlibExtensionException $exception) {
            throw new MissingZlibExtensionException(
                'FileCache compression needs PHP ext-zlib installed.',
                previous: $exception,
            );
        }
    }
}


================================================
FILE: src/Crawler.php
================================================
<?php

namespace Crwlr\Crawler;

use Closure;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\BaseStep;
use Crwlr\Crawler\Steps\Exceptions\PreRunValidationException;
use Crwlr\Crawler\Steps\Group;
use Crwlr\Crawler\Steps\StepInterface;
use Crwlr\Crawler\Stores\StoreInterface;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Exception;
use Generator;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;

/**
 * Base class of all crawlers.
 *
 * It holds the user agent, loader, logger, the initial inputs and the list of steps, and cascades every
 * input through all the steps when the crawler is run.
 */
abstract class Crawler
{
    protected UserAgentInterface $userAgent;

    protected LoaderInterface $loader;

    protected LoggerInterface $logger;

    /**
     * The initial inputs handed to the first step. Emptied again after a complete run.
     */
    protected mixed $inputs = [];

    /**
     * @var array<int, StepInterface>
     */
    protected array $steps = [];

    protected ?StoreInterface $store = null;

    /**
     * false: no memory logging, true: always log, int: log only when usage is above that number of bytes.
     */
    protected bool|int $monitorMemoryUsage = false;

    protected ?Closure $outputHook = null;

    public function __construct()
    {
        // The loader factory needs both, the user agent and the logger, so it has to come last.
        $this->userAgent = $this->userAgent();

        $this->logger = $this->logger();

        $this->loader = $this->loader($this->userAgent, $this->logger);
    }

    /**
     * A clone keeps user agent, loader and logger, but starts without inputs, steps, store and output hook.
     */
    public function __clone(): void
    {
        $this->inputs = [];

        $this->steps = [];

        $this->store = null;

        $this->outputHook = null;
    }

    abstract protected function userAgent(): UserAgentInterface;

    /**
     * Factory for the loader that is handed to the steps.
     */
    abstract protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface;

    public static function group(): Group
    {
        return new Group();
    }

    /**
     * @return false|string the return value of ini_set() for the memory_limit setting
     */
    public static function setMemoryLimit(string $memoryLimit): false|string
    {
        return ini_set('memory_limit', $memoryLimit);
    }

    /**
     * @return false|string the return value of ini_get() for the memory_limit setting
     */
    public static function getMemoryLimit(): false|string
    {
        return ini_get('memory_limit');
    }

    /**
     * Get a clone of this crawler (see __clone() for what is reset in the clone).
     */
    public function getSubCrawler(): Crawler
    {
        $subCrawler = clone $this;

        return $subCrawler;
    }

    public function getUserAgent(): UserAgentInterface
    {
        return $this->userAgent;
    }

    /**
     * Replace the user agent and create a new loader for it.
     */
    public function setUserAgent(UserAgentInterface $userAgent): static
    {
        $this->userAgent = $userAgent;

        $this->loader = $this->loader($userAgent, $this->logger);

        return $this;
    }

    public function getLogger(): LoggerInterface
    {
        return $this->logger;
    }

    /**
     * @return LoaderInterface|array<string, LoaderInterface>
     */
    public function getLoader(): LoaderInterface|array
    {
        return $this->loader;
    }

    /**
     * Set the store that receives every result. The crawler's logger is passed on to the store.
     */
    public function setStore(StoreInterface $store): static
    {
        $store->addLogger($this->logger);

        $this->store = $store;

        return $this;
    }

    /**
     * Add a single initial input.
     */
    public function input(mixed $input): static
    {
        $this->inputs[] = $input;

        return $this;
    }

    /**
     * Add multiple initial inputs at once.
     *
     * @param mixed[] $inputs
     */
    public function inputs(array $inputs): static
    {
        $this->inputs = array_merge($this->inputs, $inputs);

        return $this;
    }

    /**
     * Append a step and hand it the crawler's logger, its loader (when the step has a setLoader() method)
     * and, for BaseStep instances, a reference to this crawler.
     *
     * @return $this
     * @throws InvalidArgumentException
     */
    public function addStep(StepInterface $step): static
    {
        $step->addLogger($this->logger);

        if (method_exists($step, 'setLoader')) {
            $step->setLoader($this->loader);
        }

        if ($step instanceof BaseStep) {
            $step->setParentCrawler($this);
        }

        $this->steps[] = $step;

        return $this;
    }

    /**
     * Run the crawler and iterate over all results without doing anything with them.
     *
     * Use this when a store takes care of the results, or when the results aren't needed at all where the
     * crawler is called (e.g. when crawling for cache warming).
     *
     * @throws Exception
     */
    public function runAndTraverse(): void
    {
        foreach ($this->run() as $ignoredResult) {
            // Traversing the generator is all that's needed to make the crawler do its work.
        }
    }

    /**
     * Run the crawler and var_dump() every result as array.
     *
     * @throws Exception
     */
    public function runAndDump(): void
    {
        foreach ($this->run() as $result) {
            $resultData = $result->toArray();

            var_dump($resultData);
        }
    }

    /**
     * Run the crawler.
     *
     * Validates the steps, then invokes them for every input, cascading the outputs from step to step.
     * As this method is a generator, nothing happens until it is traversed; runAndTraverse() does that
     * for you. After all results have been yielded, inputs and steps are reset.
     *
     * @return Generator<Result>
     * @throws Exception|PreRunValidationException
     */
    public function run(): Generator
    {
        $this->validateSteps();

        $preparedInputs = $this->prepareInput();

        if ($this->firstStep()) {
            foreach ($preparedInputs as $preparedInput) {
                /** @var Generator<Result> $resultsForInput */
                $resultsForInput = $this->invokeStepsRecursive($preparedInput, $this->firstStep(), 0);

                yield from $resultsForInput;
            }
        }

        $this->reset();
    }

    /**
     * Make the crawler log the current memory usage for the outputs of the steps.
     *
     * @param int|null $ifAboveXBytes  Optional limit in bytes. When provided, the usage is only logged when it
     *                                 is above that limit.
     */
    public function monitorMemoryUsage(?int $ifAboveXBytes = null): static
    {
        $this->monitorMemoryUsage = $ifAboveXBytes ?? true;

        return $this;
    }

    /**
     * Register a callback that is called (bound to the crawler) with every output, the index of the step
     * that yielded it and the step itself.
     */
    public function outputHook(Closure $callback): static
    {
        $this->outputHook = $callback;

        return $this;
    }

    protected function logger(): LoggerInterface
    {
        return new CliLogger();
    }

    /**
     * Invoke a step with an input and pass each of its outputs on to the following steps.
     * The outputs of the last step are converted to results.
     *
     * @return Generator<Output|Result>
     */
    protected function invokeStepsRecursive(Input $input, StepInterface $step, int $stepIndex): Generator
    {
        $stepOutputs = $step->invokeStep($input);

        $followingStep = $this->nextStep($stepIndex);

        if ($followingStep === null) {
            // This was the last step, so its outputs are the crawler's results.
            yield from $this->storeAndReturnOutputsAsResults($stepOutputs);

            return;
        }

        foreach ($stepOutputs as $stepOutput) {
            if ($this->monitorMemoryUsage !== false) {
                $this->logMemoryUsage();
            }

            if ($this->outputHook !== null) {
                $this->outputHook->call($this, $stepOutput, $stepIndex, $step);
            }

            yield from $this->invokeStepsRecursive(new Input($stepOutput), $followingStep, $stepIndex + 1);
        }
    }

    /**
     * Convert the outputs of the last step to Result objects, hand them to the store (if any) and yield them.
     *
     * @param Generator<Output> $outputs
     * @return Generator<Result>
     */
    protected function storeAndReturnOutputsAsResults(Generator $outputs): Generator
    {
        foreach ($outputs as $output) {
            if ($this->outputHook !== null) {
                $this->outputHook->call($this, $output, count($this->steps) - 1, end($this->steps));
            }

            $result = new Result();

            foreach ($output->keep as $keptKey => $keptValue) {
                $result->set($keptKey, $keptValue);
            }

            $lastStep = $this->lastStep();

            $lastStepKeepsAnything = $lastStep !== null && $lastStep->keepsAnything();

            if (!$lastStepKeepsAnything) {
                // Nothing explicitly kept from the last step, so its whole output value goes to the result.
                $properties = $output->isArrayWithStringKeys() ? $output->get() : ['unnamed' => $output->get()];

                foreach ($properties as $propertyKey => $propertyValue) {
                    $result->set($propertyKey, $propertyValue);
                }
            }

            $this->store?->store($result);

            yield $result;
        }
    }

    /**
     * Call validateBeforeRun() on every step that has such a method, passing the previous step, or the
     * initial inputs for the first step. Validation errors are logged and re-thrown.
     *
     * @throws PreRunValidationException
     */
    protected function validateSteps(): void
    {
        foreach ($this->steps as $index => $step) {
            $previousStep = $index > 0 ? $this->steps[$index - 1] : null;

            if (!method_exists($step, 'validateBeforeRun')) {
                continue;
            }

            try {
                $step->validateBeforeRun($previousStep ?? $this->inputs);
            } catch (PreRunValidationException $exception) {
                $stepNumber = $index + 1;

                $this->logger->error(
                    'Pre-Run validation error in step number ' . $stepNumber . ': ' . $exception->getMessage(),
                );

                throw $exception;
            }
        }
    }

    /**
     * Wrap every initial input in an Input object.
     *
     * @return Input[]
     * @throws Exception
     */
    protected function prepareInput(): array
    {
        return array_map(static fn ($input) => new Input($input), $this->inputs);
    }

    /**
     * Log the current memory usage, unless a byte limit is configured and the usage is not above it.
     */
    protected function logMemoryUsage(): void
    {
        $currentUsage = memory_get_usage();

        $limit = $this->monitorMemoryUsage;

        if (is_int($limit) && $currentUsage <= $limit) {
            return;
        }

        $this->logger->info('memory usage: ' . $currentUsage);
    }

    protected function firstStep(): ?StepInterface
    {
        return $this->steps[0] ?? null;
    }

    /**
     * The last step, but only if it is a BaseStep; otherwise null.
     */
    protected function lastStep(): ?BaseStep
    {
        $candidate = end($this->steps);

        return $candidate instanceof BaseStep ? $candidate : null;
    }

    protected function nextStep(int $afterIndex): ?StepInterface
    {
        return $this->steps[$afterIndex + 1] ?? null;
    }

    /**
     * Forget the inputs and let every step reset itself after a run.
     */
    protected function reset(): void
    {
        $this->inputs = [];

        foreach ($this->steps as $step) {
            $step->resetAfterRun();
        }
    }
}


================================================
FILE: src/HttpCrawler/AnonymousHttpCrawlerBuilder.php
================================================
<?php

namespace Crwlr\Crawler\HttpCrawler;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;

/**
 * Builds HttpCrawler instances from an anonymous class, so users don't need to write their own crawler
 * class just to define a user agent.
 */
class AnonymousHttpCrawlerBuilder
{
    public function __construct() {}

    /**
     * Get a crawler identifying as a bot with the given product token.
     */
    public function withBotUserAgent(string $productToken): HttpCrawler
    {
        $crawler = $this->newCrawlerInstance();

        $crawler->setUserAgent(new BotUserAgent($productToken));

        return $crawler;
    }

    /**
     * Get a crawler using the given user agent. A string is wrapped in a UserAgent instance.
     */
    public function withUserAgent(string|UserAgentInterface $userAgent): HttpCrawler
    {
        $crawler = $this->newCrawlerInstance();

        if (!$userAgent instanceof UserAgentInterface) {
            $userAgent = new UserAgent($userAgent);
        }

        $crawler->setUserAgent($userAgent);

        return $crawler;
    }

    public function withMozilla5CompatibleUserAgent(): HttpCrawler
    {
        return $this->withUserAgent(UserAgent::mozilla5CompatibleBrowser());
    }

    /**
     * Create the anonymous crawler. Its placeholder user agent is replaced by the calling method via
     * setUserAgent() right after creation.
     */
    private function newCrawlerInstance(): HttpCrawler
    {
        return new class extends HttpCrawler {
            protected function userAgent(): UserAgentInterface
            {
                return new UserAgent('temp');
            }
        };
    }
}


================================================
FILE: src/HttpCrawler.php
================================================
<?php

namespace Crwlr\Crawler;

use Crwlr\Crawler\HttpCrawler\AnonymousHttpCrawlerBuilder;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;

/**
 * Crawler that loads its documents via HTTP, using the HttpLoader.
 *
 * @method HttpLoader getLoader()
 */
abstract class HttpCrawler extends Crawler
{
    /**
     * Entry point for building a crawler without writing a dedicated class.
     */
    public static function make(): AnonymousHttpCrawlerBuilder
    {
        $builder = new AnonymousHttpCrawlerBuilder();

        return $builder;
    }

    /**
     * Create the HttpLoader for the given user agent and logger.
     */
    protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        $httpLoader = new HttpLoader($userAgent, logger: $logger);

        return $httpLoader;
    }
}


================================================
FILE: src/Input.php
================================================
<?php

namespace Crwlr\Crawler;

/**
 * A value that is passed into a step. Adds nothing to Io; it only exists as a distinct type.
 */
class Input extends Io {}


================================================
FILE: src/Io.php
================================================
<?php

namespace Crwlr\Crawler;

use Crwlr\Crawler\Utils\OutputTypeHelper;

/**
 * Wrapper for a value that is passed between steps (see Input), together with the data that shall be
 * kept for the final result.
 */
class Io
{
    /**
     * Identifier of the value, lazily created by setKey() / getKey().
     */
    protected string|int|float|bool|null $key = null;

    /**
     * When the value is itself an Io instance, its value and kept data are adopted instead of nesting it.
     *
     * @param mixed[] $keep
     */
    final public function __construct(
        protected mixed $value,
        public array $keep = [],
    ) {
        if ($value instanceof self) {
            $this->value = $value->value;

            $this->keep = $value->keep;
        }
    }

    /**
     * Get a new instance with another value but the same kept data.
     */
    public function withValue(mixed $value): static
    {
        return new static($value, $this->keep);
    }

    /**
     * Get a new instance where one property of the (array) value is set to the given value.
     *
     * When the value is not an array with only string keys, an unchanged copy is returned.
     */
    public function withPropertyValue(string $key, mixed $value): static
    {
        if (!$this->isArrayWithStringKeys()) {
            return new static($this);
        }

        $newValue = $this->value;

        $newValue[$key] = $value;

        return $this->withValue($newValue);
    }

    public function get(): mixed
    {
        return $this->value;
    }

    /**
     * Get a single property from an array or object value.
     *
     * @return mixed the property's value, or $fallbackValue when it is missing, null, or when the wrapped
     *               value is neither an array nor an object
     */
    public function getProperty(string $key, mixed $fallbackValue = null): mixed
    {
        if (is_array($this->value)) {
            return $this->value[$key] ?? $fallbackValue;
        } elseif (is_object($this->value)) {
            $array = OutputTypeHelper::objectToArray($this->value);

            return $array[$key] ?? $fallbackValue;
        }

        return $fallbackValue;
    }

    /**
     * Sets and returns a key to use as identifier
     *
     * To only get unique results from a step use the key this method creates for comparison.
     * In case the output values are arrays or objects and contain a unique identifier that can be used, provide that
     * key name, so it doesn't need to create a key from the whole array/object.
     */
    public function setKey(?string $useFromValue = null): string
    {
        // Compare against null instead of relying on truthiness, so a property named '0' is not ignored.
        if ($useFromValue !== null && is_array($this->value) && array_key_exists($useFromValue, $this->value)) {
            $this->key = $this->valueToString($this->value[$useFromValue]);
        } elseif ($useFromValue !== null && is_object($this->value) && property_exists($this->value, $useFromValue)) {
            $this->key = $this->valueToString($this->value->{$useFromValue});
        } else {
            $this->key = $this->valueToString($this->value);
        }

        return $this->key;
    }

    /**
     * Get the key, creating it from the whole value if it wasn't set yet.
     */
    public function getKey(): string|int|float|bool|null
    {
        if ($this->key === null) {
            $this->setKey();
        }

        return $this->key;
    }

    /**
     * Merge (recursively) more data into the data that is kept for the final result.
     *
     * @param mixed[] $data
     */
    public function keep(array $data): static
    {
        $this->keep = array_merge_recursive($this->keep, $data);

        return $this;
    }

    /**
     * Check if the value is an array that has no non-string keys (this is also true for an empty array).
     */
    public function isArrayWithStringKeys(): bool
    {
        if (!is_array($this->value)) {
            return false;
        }

        foreach ($this->value as $key => $value) {
            if (!is_string($key)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Convert the given value to a string that can be used as key.
     *
     * Arrays and objects become the md5 hash of their serialized form, booleans and null are spelled out.
     */
    protected function valueToString(mixed $value): string
    {
        if (is_array($value) || is_object($value)) {
            // Hash the value that was passed in, not the whole wrapped value. Otherwise a key built from a
            // single (array or object) property would still depend on all other properties.
            return md5(serialize($value));
        } elseif (is_int($value) || is_float($value)) {
            return (string) $value;
        } elseif (is_bool($value)) {
            return $value ? 'true' : 'false';
        } elseif (is_null($value)) {
            return 'null';
        }

        return $value;
    }
}


================================================
FILE: src/Loader/Http/Browser/Screenshot.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Browser;

/**
 * Immutable value object holding the path of a screenshot file.
 */
class Screenshot
{
    public readonly string $path;

    public function __construct(string $path)
    {
        $this->path = $path;
    }
}


================================================
FILE: src/Loader/Http/Browser/ScreenshotConfig.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Browser;

use Crwlr\Utils\Microseconds;
use HeadlessChromium\Clip;
use HeadlessChromium\Exception\CommunicationException\CannotReadResponse;
use HeadlessChromium\Exception\CommunicationException\InvalidResponse;
use HeadlessChromium\Page;

/**
 * Settings for taking screenshots with the headless browser: where to store them, the image file type,
 * the quality (jpeg and webp only) and whether to capture the full page.
 */
class ScreenshotConfig
{
    public function __construct(
        public string $storePath,
        public string $fileType = 'png',
        public ?int $quality = null,
        public bool $fullPage = false,
    ) {}

    public static function make(string $storePath): self
    {
        return new self($storePath);
    }

    /**
     * Build the path for a new screenshot of the given page: the store path, followed by a file name made
     * of the md5 hash of the page's current URL, the current microseconds value and the file type.
     *
     * @throws CannotReadResponse
     * @throws InvalidResponse
     */
    public function getFullPath(Page $page): string
    {
        $directory = str_ends_with($this->storePath, '/') ? $this->storePath : $this->storePath . '/';

        return $directory . md5($page->getCurrentUrl()) . '-' . Microseconds::now()->value . '.' . $this->fileType;
    }

    /**
     * Set the image file type. Values other than jpeg, png or webp are ignored.
     *
     * Switching to png drops the quality setting; switching to jpeg or webp sets a quality of 80 unless
     * a quality is already set.
     */
    public function setImageFileType(string $type): self
    {
        if (!in_array($type, ['jpeg', 'png', 'webp'], true)) {
            return $this;
        }

        $this->fileType = $type;

        if ($type === 'png') {
            $this->quality = null;
        } elseif ($this->quality === null) {
            $this->quality = 80;
        }

        return $this;
    }

    /**
     * Set the image quality (1 - 100). Ignored when out of range or when the file type is not jpeg or webp.
     */
    public function setQuality(int $quality): self
    {
        $typeHasQuality = in_array($this->fileType, ['jpeg', 'webp'], true);

        $inRange = $quality > 0 && $quality <= 100;

        if ($typeHasQuality && $inRange) {
            $this->quality = $quality;
        }

        return $this;
    }

    public function setFullPage(): self
    {
        $this->fullPage = true;

        return $this;
    }

    /**
     * Convert the settings to the options array passed to the browser page for taking the screenshot.
     *
     * @return array<string, int|string|bool|Clip>
     */
    public function toChromePhpScreenshotConfig(Page $page): array
    {
        $options = ['format' => $this->fileType];

        $typeHasQuality = in_array($this->fileType, ['jpeg', 'webp'], true);

        if ($this->quality && $typeHasQuality) {
            $options['quality'] = $this->quality;
        }

        if (!$this->fullPage) {
            return $options;
        }

        $options['captureBeyondViewport'] = true;

        $options['clip'] = $page->getFullPageClip();

        return $options;
    }
}


================================================
FILE: src/Loader/Http/Cache/RetryManager.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Cache;

/**
 * Decides, based on the status code of an error response, if a request shall be retried.
 *
 * @internal
 */
class RetryManager
{
    /**
     * @param int[]|null $only    when set, only these status codes are retried
     * @param int[]|null $except  when set, these status codes are never retried
     */
    public function __construct(
        private ?array $only = null,
        private ?array $except = null,
    ) {}

    /**
     * Restrict retrying to the given status code(s).
     *
     * @param int|int[] $statusCodes
     */
    public function only(int|array $statusCodes): static
    {
        $this->only = (array) $statusCodes;

        return $this;
    }

    /**
     * Exclude the given status code(s) from retrying.
     *
     * @param int|int[] $statusCodes
     */
    public function except(int|array $statusCodes): static
    {
        $this->except = (array) $statusCodes;

        return $this;
    }

    /**
     * Only status codes from 400 upwards are retried, and only if they are not excluded via except()
     * and, when only() is used, are part of that list.
     */
    public function shallBeRetried(int $statusCode): bool
    {
        if ($statusCode < 400) {
            return false;
        }

        if ($this->except !== null && in_array($statusCode, $this->except, true)) {
            return false;
        }

        return $this->only === null || in_array($statusCode, $this->only, true);
    }
}


================================================
FILE: src/Loader/Http/Cookies/Cookie.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Cookies;

use Crwlr\Crawler\Loader\Http\Cookies\Exceptions\InvalidCookieException;
use Crwlr\Url\Psr\Uri;
use Crwlr\Url\Url;
use Exception;
use Psr\Http\Message\UriInterface;

/**
 * A cookie parsed from a Set-Cookie header, together with the URL it was received from.
 *
 * Knows its attributes (Expires, Max-Age, Domain, Path, Secure, HttpOnly, SameSite) and can tell whether it
 * should be sent along with a request to a certain URL.
 */
class Cookie
{
    protected Url $receivedFromUrl;

    protected string $receivedFromHost;

    protected string $cookieName;

    protected string $cookieValue;

    protected ?Date $expires = null;

    protected ?int $maxAge = null;

    /**
     * Time when the Max-Age attribute was parsed; the base for calculating the expiry from Max-Age.
     */
    protected int $receivedAtTimestamp = 0;

    protected string $domain;

    protected bool $domainSetViaAttribute = false;

    protected ?string $path = null;

    protected bool $secure = false;

    protected bool $httpOnly = false;

    protected string $sameSite = 'Lax';

    /**
     * @param string|Url $receivedFromUrl  the URL of the response that contained the Set-Cookie header
     * @param string $setCookieHeader      the value of the Set-Cookie header
     * @throws InvalidCookieException
     * @throws Exception
     */
    public function __construct(
        string|Url $receivedFromUrl,
        protected readonly string $setCookieHeader,
    ) {
        $this->receivedFromUrl = $receivedFromUrl instanceof Url ? $receivedFromUrl : Url::parse($receivedFromUrl);

        if (
            !is_string($this->receivedFromUrl->host()) ||
            empty($this->receivedFromUrl->host())
        ) {
            throw new InvalidCookieException('Url where cookie was received from has no host or domain');
        }

        $this->receivedFromHost = $this->receivedFromUrl->host();

        // Default domain, a Domain attribute in the header may override it when the header is parsed.
        $this->setDomain($this->receivedFromUrl->domain() ?? $this->receivedFromUrl->host());

        $this->parseSetCookieHeader($this->setCookieHeader);
    }

    /**
     * Check if the cookie should be sent with a request to the given URL.
     *
     * The URL's host has to match the cookie's domain (and exactly equal the host the cookie was received
     * from when it has the __Host- prefix), Secure cookies are only sent via https or to localhost, the
     * path has to match when the cookie has one, and the cookie must not be expired.
     *
     * @throws Exception
     */
    public function shouldBeSentTo(string|UriInterface|Url $url): bool
    {
        $url = $url instanceof Url ? $url : Url::parse($url);

        $urlHost = $url->host() ?? '';

        return
            $this->domainMatches($urlHost, $this->domain()) &&
            (!$this->hasHostPrefix() || $urlHost === $this->receivedFromHost) &&
            (!$this->secure() || $url->scheme() === 'https' || in_array($urlHost, ['localhost', '127.0.0.1'], true)) &&
            (!$this->path() || $this->pathMatches($url)) &&
            !$this->isExpired();
    }

    /**
     * @return string the cookie as name=value pair, like it is sent in a Cookie request header
     */
    public function __toString(): string
    {
        return $this->name() . '=' . $this->value();
    }

    public function receivedFromUrl(): UriInterface
    {
        return new Uri($this->receivedFromUrl);
    }

    public function name(): string
    {
        return $this->cookieName;
    }

    public function value(): string
    {
        return $this->cookieValue;
    }

    public function expires(): ?Date
    {
        return $this->expires;
    }

    public function maxAge(): ?int
    {
        return $this->maxAge;
    }

    /**
     * Check if the cookie is expired, based on its Expires and Max-Age attributes.
     * A cookie with none of the two attributes never expires.
     */
    public function isExpired(): bool
    {
        if ($this->expires() === null && $this->maxAge() === null) {
            return false;
        }

        $nowTimestamp = time();

        if ($this->expires() instanceof Date && $nowTimestamp >= $this->expires()->dateTime()->getTimestamp()) {
            return true;
        }

        // A Max-Age of zero or less means the cookie expires immediately.
        return $this->maxAge() !== null &&
            ($this->maxAge() <= 0 || $nowTimestamp > ($this->receivedAtTimestamp + $this->maxAge()));
    }

    public function domain(): string
    {
        return $this->domain;
    }

    public function path(): ?string
    {
        return $this->path;
    }

    public function secure(): bool
    {
        return $this->secure;
    }

    public function httpOnly(): bool
    {
        return $this->httpOnly;
    }

    public function sameSite(): string
    {
        return $this->sameSite;
    }

    /**
     * Check if the cookie was received from an https URL.
     *
     * @throws Exception
     */
    public function isReceivedSecure(): bool
    {
        return $this->receivedFromUrl->scheme() === 'https';
    }

    public function hasSecurePrefix(): bool
    {
        return str_starts_with($this->cookieName, '__Secure-');
    }

    public function hasHostPrefix(): bool
    {
        return str_starts_with($this->cookieName, '__Host-');
    }

    /**
     * Parse the Set-Cookie header: the name=value pair first, then all the attributes separated by semicolons.
     *
     * @throws InvalidCookieException
     */
    protected function parseSetCookieHeader(string $setCookieHeader): void
    {
        $splitAtSemicolon = explode(';', $setCookieHeader);

        $splitFirstPart = explode('=', trim(array_shift($splitAtSemicolon)), 2);

        if (count($splitFirstPart) !== 2) {
            throw new InvalidCookieException('Invalid cookie string');
        }

        [$this->cookieName, $this->cookieValue] = $splitFirstPart;

        foreach ($splitAtSemicolon as $attribute) {
            $this->parseAttribute($attribute);
        }

        // Prefix rules depend on other attributes, so they can only be checked after all of them are parsed.
        $this->checkPrefixes();
    }

    /**
     * Parse a single attribute (name, optionally followed by =value). Attribute names are matched
     * case-insensitively, unknown attributes are ignored.
     *
     * @throws InvalidCookieException
     */
    protected function parseAttribute(string $attribute): void
    {
        $splitAtEquals = explode('=', trim($attribute), 2);

        $attributeName = strtolower($splitAtEquals[0]);

        $attributeValue = $splitAtEquals[1] ?? '';

        if ($attributeName === 'expires') {
            $this->setExpires($attributeValue);
        } elseif ($attributeName === 'max-age') {
            $this->setMaxAge($attributeValue);
        } elseif ($attributeName === 'domain') {
            $this->setDomain($attributeValue, true);
        } elseif ($attributeName === 'path') {
            $this->setPath($attributeValue);
        } elseif ($attributeName === 'secure') {
            $this->setSecure();
        } elseif ($attributeName === 'httponly') {
            $this->httpOnly = true;
        } elseif ($attributeName === 'samesite') {
            $this->setSameSite($attributeValue);
        }
    }

    /**
     * Enforce the rules for cookies with a __Secure- or __Host- name prefix.
     *
     * @see https://datatracker.ietf.org/doc/html/draft-west-cookie-prefixes#section-3
     * @throws InvalidCookieException
     * @throws Exception
     */
    protected function checkPrefixes(): void
    {
        if ($this->hasSecurePrefix() || $this->hasHostPrefix()) {
            if (!$this->isReceivedSecure()) {
                throw new InvalidCookieException(
                    'Cookie is prefixed with __Secure- or __Host- but was not sent via https',
                );
            }

            if (!$this->secure()) {
                throw new InvalidCookieException(
                    'Cookie is prefixed with __Secure- or __Host- but Secure flag was not sent',
                );
            }
        }

        if ($this->hasHostPrefix()) {
            if ($this->domainSetViaAttribute) {
                throw new InvalidCookieException('Cookie with __Host- prefix must not contain a Domain attribute');
            }

            if ($this->path !== '/') {
                throw new InvalidCookieException('Cookie with __Host- prefix must have a Path attribute with value /');
            }
        }
    }

    protected function setExpires(string $value): void
    {
        $this->expires = new Date($value);
    }

    /**
     * Set the Max-Age and remember the current time as the base for calculating the expiry.
     */
    protected function setMaxAge(string $value): void
    {
        $this->maxAge = (int) $value;

        $this->receivedAtTimestamp = time();
    }

    /**
     * Set the domain the cookie is valid for. A leading dot is removed.
     *
     * The host the cookie was received from must be the domain itself or one of its subdomains. An empty
     * Domain attribute is ignored, so the default domain stays in place.
     *
     * @throws InvalidCookieException
     * @throws Exception
     */
    protected function setDomain(string $value, bool $viaAttribute = false): void
    {
        if (str_starts_with($value, '.')) {
            $value = substr($value, 1);
        }

        // RFC 6265 (5.2.3) recommends ignoring an empty Domain attribute. Accepting it would result in an
        // empty domain, which must never be treated as matching any host.
        if ($viaAttribute && $value === '') {
            return;
        }

        if (!$this->domainMatches($this->receivedFromHost, $value)) {
            throw new InvalidCookieException(
                'Setting cookie for ' . $value . ' from ' . $this->receivedFromUrl->host() . ' is not allowed.',
            );
        }

        $this->domain = $value;

        if ($viaAttribute) {
            $this->domainSetViaAttribute = true;
        }
    }

    protected function setPath(string $path): void
    {
        $this->path = $path;
    }

    /**
     * Mark the cookie as Secure. Only allowed when it was received via https.
     *
     * @throws InvalidCookieException
     * @throws Exception
     */
    protected function setSecure(): void
    {
        if (!$this->isReceivedSecure()) {
            throw new InvalidCookieException(
                'Secure flag can\'t be set when cookie was sent from non-https document url.',
            );
        }

        $this->secure = true;
    }

    /**
     * Set the SameSite value (Strict, Lax or None; matched case-insensitively, stored capitalized).
     *
     * @throws InvalidCookieException
     */
    protected function setSameSite(string $value): void
    {
        $value = strtolower($value);

        if (!in_array($value, ['strict', 'lax', 'none'], true)) {
            throw new InvalidCookieException('Invalid value for attribute SameSite');
        }

        $this->sameSite = ucfirst($value);
    }

    /**
     * Check if the path of the URL matches the cookie's path, as defined in RFC 6265 (5.1.4): the paths are
     * identical, or the cookie path is a prefix of the URL path and either the cookie path ends with a
     * slash, or the first character of the URL path after the cookie path is a slash.
     *
     * @throws Exception
     */
    protected function pathMatches(Url $url): bool
    {
        $path = $this->path() ?? '';

        $urlPath = $url->path() ?? '';

        return str_starts_with($urlPath, $path) &&
            (
                $urlPath === $path ||
                str_ends_with($path, '/') ||
                str_starts_with($urlPath, $path . '/')
            );
    }

    /**
     * Check if a host matches a cookie domain, as defined in RFC 6265 (5.1.3): the host is identical to
     * the domain, or it is a host name (not an IP address) ending with a dot followed by the domain.
     *
     * A plain substring check is not enough here, it would e.g. let example.com match notexample.com or
     * example.com.evil.org.
     */
    protected function domainMatches(string $host, string $domain): bool
    {
        $host = strtolower($host);

        $domain = strtolower($domain);

        if ($domain === '') {
            return false;
        }

        if ($host === $domain) {
            return true;
        }

        return str_ends_with($host, '.' . $domain) && filter_var($host, FILTER_VALIDATE_IP) === false;
    }
}


================================================
FILE: src/Loader/Http/Cookies/CookieJar.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Cookies;

use Crwlr\Crawler\Loader\Http\Cookies\Exceptions\InvalidCookieException;
use Crwlr\Url\Url;
use DateTime;
use Exception;
use HeadlessChromium\Cookies\Cookie as BrowserCookie;
use HeadlessChromium\Cookies\CookiesCollection;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;

class CookieJar
{
    /**
     * Stored cookies, keyed first by domain and then by cookie name.
     *
     * @var Cookie[][]
     */
    protected array $jar = [];

    /**
     * Returns all cookies stored under the given domain key, or an empty array if there are none.
     *
     * @param string $domain
     * @return Cookie[]
     */
    public function allByDomain(string $domain): array
    {
        return array_key_exists($domain, $this->jar) ? $this->jar[$domain] : [];
    }

    /**
     * Removes all cookies from the jar.
     */
    public function flush(): void
    {
        $this->jar = [];
    }

    /**
     * Adds the cookies of a response (set-cookie headers) or of a browser cookie collection.
     *
     * A cookie with the same name under the same domain key replaces the previously stored one.
     *
     * @throws InvalidCookieException
     * @throws Exception
     */
    public function addFrom(string|UriInterface|Url $url, ResponseInterface|CookiesCollection $response): void
    {
        if ($response instanceof CookiesCollection) {
            $this->addFromBrowserCookieCollection($url, $response);

            return;
        }

        $setCookieHeaders = $response->getHeader('set-cookie');

        if (empty($setCookieHeaders)) {
            return;
        }

        if (!$url instanceof Url) {
            $url = Url::parse($url);
        }

        $domainKey = $this->getForDomainFromUrl($url);

        if (!$domainKey) {
            return;
        }

        foreach ($setCookieHeaders as $setCookieHeader) {
            $parsedCookie = new Cookie($url, $setCookieHeader);

            $this->jar[$domainKey][$parsedCookie->name()] = $parsedCookie;
        }
    }

    /**
     * Adds all cookies of a headless browser cookie collection.
     *
     * Each browser cookie is first turned into a set-cookie header string and then parsed
     * into a Cookie object.
     *
     * @throws InvalidCookieException
     * @throws Exception
     */
    public function addFromBrowserCookieCollection(string|UriInterface|Url $url, CookiesCollection $collection): void
    {
        if ($collection->count() === 0) {
            return;
        }

        $parsedUrl = $url instanceof Url ? $url : Url::parse($url);

        $domainKey = $this->getForDomainFromUrl($parsedUrl);

        if (!$domainKey) {
            return;
        }

        foreach ($collection as $browserCookie) {
            $parsedCookie = new Cookie($parsedUrl, $this->buildSetCookieHeaderFromBrowserCookie($browserCookie));

            $this->jar[$domainKey][$parsedCookie->name()] = $parsedCookie;
        }
    }

    /**
     * Returns the stored cookies that should be sent along with a request to the given URL.
     *
     * @return Cookie[]
     * @throws Exception
     */
    public function getFor(string|UriInterface $url): array
    {
        $domainKey = $this->getForDomainFromUrl($url);

        if (!$domainKey || !array_key_exists($domainKey, $this->jar)) {
            return [];
        }

        $matching = array_filter(
            $this->jar[$domainKey],
            function ($storedCookie) use ($url) {
                return $storedCookie->shouldBeSentTo($url);
            },
        );

        return array_values($matching);
    }

    /**
     * Determines the key under which cookies for the given URL are stored.
     *
     * That is the URL's domain or, when the domain is empty, its host. Returns null when
     * neither yields a string.
     *
     * @throws Exception
     */
    protected function getForDomainFromUrl(string|UriInterface|Url $url): ?string
    {
        $parsedUrl = $url instanceof Url ? $url : Url::parse($url);

        $key = empty($parsedUrl->domain()) ? $parsedUrl->host() : $parsedUrl->domain();

        return is_string($key) ? $key : null;
    }

    /**
     * Builds a set-cookie header string from a headless browser cookie.
     *
     * Empty attribute values are skipped, boolean true values become flag attributes and
     * an expires value of -1 is left out.
     */
    protected function buildSetCookieHeaderFromBrowserCookie(BrowserCookie $cookie): string
    {
        // Browser cookie key => attribute name in the set-cookie header.
        $attributeNames = [
            'domain' => 'Domain',
            'expires' => 'Expires',
            'max-age' => 'Max-Age',
            'path' => 'Path',
            'secure' => 'Secure',
            'httpOnly' => 'HttpOnly',
            'sameSite' => 'SameSite',
        ];

        $segments = [sprintf('%s=%s', $cookie->getName(), $cookie->getValue())];

        foreach ($attributeNames as $browserKey => $headerName) {
            $rawValue = $cookie->offsetGet($browserKey);

            if (empty($rawValue)) {
                continue;
            }

            if ($browserKey === 'expires') {
                // "Expires" attribute
                if ($rawValue !== -1) {
                    $segments[] = sprintf('%s=%s', $headerName, $this->formatExpiresValue($rawValue));
                }
            } elseif ($rawValue === true) {
                // Flag attributes
                $segments[] = $headerName;
            } else {
                $segments[] = sprintf('%s=%s', $headerName, $rawValue);
            }
        }

        return implode('; ', $segments);
    }

    /**
     * Converts a numeric timestamp (optionally with fractional seconds) to a date string.
     *
     * Anything that is not numeric, or can't be parsed as a timestamp, is returned as a string.
     */
    private function formatExpiresValue(mixed $value): string
    {
        if (!is_numeric($value)) {
            return (string) $value;
        }

        $timestamp = (string) $value;

        if (!str_contains($timestamp, '.')) {
            $format = 'U';
        } else {
            // Up to three fraction digits are read as milliseconds, more as microseconds.
            $fraction = explode('.', $timestamp, 2)[1];

            $format = strlen($fraction) <= 3 ? 'U.v' : 'U.u';
        }

        $dateTime = DateTime::createFromFormat($format, $timestamp);

        return $dateTime !== false ? $dateTime->format('l, d M Y H:i:s T') : $timestamp;
    }
}


================================================
FILE: src/Loader/Http/Cookies/Date.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Cookies;

use DateTime;
use DateTimeInterface;
use InvalidArgumentException;

/**
 * Lazily parses the date string of a cookie's Expires attribute to a DateTime object.
 */
class Date
{
    /**
     * Formats tried in this order when parsing the date string.
     *
     * The two-digit year (RFC 850) format has to come first. With a four-digit year format,
     * a value like "21-Oct-15" is not read as the year 2015. The two-digit format can't
     * accidentally match a four-digit year, because the remaining digits don't fit the
     * rest of the format.
     */
    protected const PARSE_FORMATS = [
        'l, d-M-y H:i:s T',
        DateTimeInterface::COOKIE,
        'l, d M Y H:i:s T',
    ];

    protected ?DateTime $dateTime = null;

    public function __construct(protected readonly string $httpDateString) {}

    /**
     * Returns the parsed date. The string is parsed on the first call only, afterwards the
     * same DateTime instance is returned.
     *
     * @throws InvalidArgumentException when the string matches none of the supported formats.
     */
    public function dateTime(): DateTime
    {
        if ($this->dateTime instanceof DateTime) {
            return $this->dateTime;
        }

        foreach (self::PARSE_FORMATS as $format) {
            $dateTime = DateTime::createFromFormat($format, $this->httpDateString);

            if ($dateTime instanceof DateTime) {
                $this->dateTime = $dateTime;

                return $this->dateTime;
            }
        }

        throw new InvalidArgumentException('Can\'t parse date string ' . $this->httpDateString);
    }
}


================================================
FILE: src/Loader/Http/Cookies/Exceptions/InvalidCookieException.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Cookies\Exceptions;

use Exception;

/**
 * Thrown when a cookie or one of its attributes is rejected, for example a domain that is
 * not allowed for the host it was received from, a Secure flag on a cookie that was not
 * received securely, or an invalid SameSite value.
 */
class InvalidCookieException extends Exception {}


================================================
FILE: src/Loader/Http/Exceptions/LoadingException.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Exceptions;

use Exception;
use Psr\Http\Message\UriInterface;
use Throwable;

/**
 * Exception signaling that loading a resource failed.
 */
class LoadingException extends Exception
{
    /**
     * Status code of the error response that caused the failure, if there was a response.
     */
    public ?int $httpStatusCode = null;

    /**
     * Wraps any throwable in a LoadingException, keeping it as the previous exception.
     *
     * When the wrapped throwable is a LoadingException itself, its HTTP status code is
     * carried over, so it is not lost for code catching the outer exception.
     */
    public static function from(Throwable $previousException): self
    {
        $instance = new self(
            'Loading failed. Exception of type ' . get_class($previousException) . ' was thrown. Exception message: ' .
            $previousException->getMessage(),
            previous: $previousException,
        );

        if ($previousException instanceof self) {
            $instance->httpStatusCode = $previousException->httpStatusCode;
        }

        return $instance;
    }

    /**
     * Creates an exception for a URI that failed to load, optionally with the HTTP status
     * code of the response. The status code is appended to the message in parentheses and
     * stored in the httpStatusCode property.
     */
    public static function make(string|UriInterface $uri, ?int $httpStatusCode = null): self
    {
        if ($uri instanceof UriInterface) {
            $uri = (string) $uri;
        }

        $message = 'Failed to load ' . $uri;

        $message .= $httpStatusCode !== null ? ' (' . $httpStatusCode . ').' : '.';

        $instance = new self($message);

        $instance->httpStatusCode = $httpStatusCode;

        return $instance;
    }
}


================================================
FILE: src/Loader/Http/HeadlessBrowserLoaderHelper.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http;

use Closure;
use Crwlr\Crawler\Loader\Http\Browser\Screenshot;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\Cookies\Exceptions\InvalidCookieException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
use Exception;
use GuzzleHttp\Psr7\Response;
use HeadlessChromium\Browser;
use HeadlessChromium\BrowserFactory;
use HeadlessChromium\Communication\Message;
use HeadlessChromium\Exception\CommunicationException;
use HeadlessChromium\Exception\CommunicationException\CannotReadResponse;
use HeadlessChromium\Exception\CommunicationException\InvalidResponse;
use HeadlessChromium\Exception\CommunicationException\ResponseHasError;
use HeadlessChromium\Exception\JavascriptException;
use HeadlessChromium\Exception\NavigationExpired;
use HeadlessChromium\Exception\NoResponseAvailable;
use HeadlessChromium\Exception\OperationTimedOut;
use HeadlessChromium\Exception\TargetDestroyed;
use HeadlessChromium\Page;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;
use Throwable;

/**
 * Helper that loads pages with a headless Chrome browser and turns the result into
 * RespondedRequest objects.
 *
 * It keeps one browser and one page instance open and re-uses them for subsequent requests,
 * unless the browser options or the proxy changed.
 */
class HeadlessBrowserLoaderHelper
{
    /**
     * Path/command of the browser executable, passed to the BrowserFactory.
     */
    protected ?string $executable = null;

    /**
     * Options passed to the BrowserFactory when creating a browser.
     *
     * @var array<string, mixed>
     */
    protected array $options = [
        'windowSize' => [1920, 1000],
    ];

    /**
     * True when options were changed after the current browser instance was created.
     */
    protected bool $optionsDirty = false;

    protected ?Browser $browser = null;

    protected ?Page $page = null;

    /**
     * The proxy the current browser instance was created with.
     */
    protected ?string $proxy = null;

    /**
     * Name of the event to wait for when navigating. Null means the library's default.
     */
    protected ?string $waitForEvent = null;

    /**
     * Navigation timeout, passed on to waitForNavigation().
     */
    protected int $timeout = 30_000;

    protected ?string $pageInitScript = null;

    protected bool $useNativeUserAgent = false;

    protected bool $includeShadowElements = false;

    /**
     * @var Closure[]
     */
    protected array $tempPostNavigateHooks = [];

    public function __construct(
        private ?BrowserFactory $browserFactory = null,
        protected ?LoggerInterface $logger = null,
    ) {}

    /**
     * Set temporary post navigate hooks
     *
     * They will be executed after the next call to navigateToPageAndGetRespondedRequest()
     * and forgotten afterward.
     *
     * @param Closure[] $hooks
     */
    public function setTempPostNavigateHooks(array $hooks): static
    {
        $this->tempPostNavigateHooks = $hooks;

        return $this;
    }

    /**
     * Navigates to the URL of the request and builds a RespondedRequest from the result.
     *
     * Status code and headers are taken from the first Network.responseReceived event.
     * For responses that are not HTML documents, the raw response body is used when it can
     * be retrieved, otherwise the HTML of the page.
     *
     * When no cookie jar is passed, the browser's cookies are cleared before navigating.
     * Otherwise, the browser's cookies are added to the jar after navigating.
     *
     * @throws OperationTimedOut
     * @throws CommunicationException
     * @throws NoResponseAvailable
     * @throws NavigationExpired
     * @throws InvalidResponse
     * @throws CannotReadResponse
     * @throws ResponseHasError
     * @throws JavascriptException
     * @throws Exception
     */
    public function navigateToPageAndGetRespondedRequest(
        RequestInterface $request,
        Throttler $throttler,
        ?string $proxy = null,
        ?CookieJar $cookieJar = null,
    ): RespondedRequest {
        if (!$this->page || $this->shouldRenewBrowser($proxy)) {
            $this->page = $this->getBrowser($request, $proxy)->createPage();
        } else {
            try {
                $this->page->assertNotClosed();
            } catch (TargetDestroyed) {
                $this->page = $this->getBrowser($request, $proxy)->createPage();
            }
        }

        if ($cookieJar === null) {
            $this->page->getSession()->sendMessageSync(new Message('Network.clearBrowserCookies'));
        }

        // Defaults, in case no Network.responseReceived event is received.
        $statusCode = 200;

        $responseHeaders = [];

        $requestId = null;

        $this->page->getSession()->once(
            "method:Network.responseReceived",
            function ($params) use (&$statusCode, &$responseHeaders, &$requestId) {
                $statusCode = $params['response']['status'];

                $responseHeaders = $this->sanitizeResponseHeaders($params['response']['headers']);

                $requestId = $params['requestId'] ?? null;
            },
        );

        $throttler->trackRequestStartFor($request->getUri());

        $this->navigate($request->getUri()->__toString());

        $throttler->trackRequestEndFor($request->getUri());

        $hookActionData = $this->callPostNavigateHooks();

        if (is_string($requestId) && $this->page && !$this->responseIsHtmlDocument($this->page)) {
            $html = $this->tryToGetRawResponseBody($this->page, $requestId) ?? $this->getHtmlFromPage();
        } else {
            $html = $this->getHtmlFromPage();
        }

        $this->addCookiesToJar($cookieJar, $request->getUri());

        return new RespondedRequest(
            $request,
            new Response($statusCode, $responseHeaders, $html),
            $hookActionData['screenshots'] ?? [],
        );
    }

    public function getOpenBrowser(): ?Browser
    {
        return $this->browser;
    }

    public function getOpenPage(): ?Page
    {
        return $this->page;
    }

    /**
     * Closes the open page (if any) and the browser.
     *
     * The browser is closed and both references are reset even when closing the page (or
     * the browser) throws, so a failure here can't leave a stale page or a still running
     * browser behind. The exception is still passed on to the caller.
     *
     * @throws Exception
     */
    public function closeBrowser(): void
    {
        if (!$this->browser) {
            return;
        }

        try {
            $this->page?->close();
        } finally {
            $this->page = null;

            try {
                $this->browser->close();
            } finally {
                $this->browser = null;
            }
        }
    }

    public function setExecutable(string $executable): static
    {
        $this->executable = $executable;

        return $this;
    }

    /**
     * Replaces all browser options. Takes effect with the next browser instance, which is
     * created on the next navigation because the options are marked as changed.
     *
     * @param array<string, mixed> $options
     */
    public function setOptions(array $options): static
    {
        $this->options = $options;

        $this->optionsDirty = true;

        return $this;
    }

    /**
     * Adds (or overwrites) single browser options and marks the options as changed.
     *
     * @param array<string, mixed> $options
     */
    public function addOptions(array $options): static
    {
        foreach ($options as $key => $value) {
            $this->options[$key] = $value;
        }

        $this->optionsDirty = true;

        return $this;
    }

    public function waitForNavigationEvent(string $eventName): static
    {
        $this->waitForEvent = $eventName;

        return $this;
    }

    public function getTimeout(): int
    {
        return $this->timeout;
    }

    public function setTimeout(int $timeout): static
    {
        $this->timeout = $timeout;

        return $this;
    }

    /**
     * Reduces every header value to its first line.
     *
     * Lines are split on "\n" (a trailing "\r" is removed) instead of PHP_EOL, so the
     * result does not depend on the platform PHP is running on.
     *
     * @param string[] $headers
     * @return string[]
     */
    public function sanitizeResponseHeaders(array $headers): array
    {
        foreach ($headers as $key => $value) {
            $headers[$key] = rtrim(explode("\n", $value, 2)[0], "\r");
        }

        return $headers;
    }

    /**
     * @param string $scriptSource
     * @return $this
     */
    public function setPageInitScript(string $scriptSource): static
    {
        $this->pageInitScript = $scriptSource;

        return $this;
    }

    public function useNativeUserAgent(): static
    {
        $this->useNativeUserAgent = true;

        return $this;
    }

    public function includeShadowElementsInHtml(): static
    {
        $this->includeShadowElements = true;

        return $this;
    }

    /**
     * Navigates the current page to the URL and waits for the navigation to finish, using
     * the configured event (if any) and timeout. Does nothing when there is no page.
     *
     * @throws OperationTimedOut
     * @throws CommunicationException
     * @throws NavigationExpired
     * @throws NoResponseAvailable
     * @throws InvalidResponse
     * @throws CannotReadResponse
     * @throws ResponseHasError
     */
    protected function navigate(string $url): void
    {
        if ($this->waitForEvent) {
            $this->page?->navigate($url)->waitForNavigation($this->waitForEvent, $this->timeout);
        } else {
            $this->page?->navigate($url)->waitForNavigation(timeout: $this->timeout);
        }
    }

    /**
     * Calls the temporary post navigate hooks (bound to this object, with the page and the
     * logger as arguments) and forgets them afterward.
     *
     * Screenshot objects returned by hooks are collected under the key "screenshots".
     *
     * @return array<string, mixed>
     */
    protected function callPostNavigateHooks(): array
    {
        $returnData = [];

        foreach ($this->tempPostNavigateHooks as $hook) {
            $returnValue = $hook->call($this, $this->page, $this->logger);

            if ($returnValue instanceof Screenshot) {
                $returnData['screenshots'][] = $returnValue;
            }
        }

        $this->tempPostNavigateHooks = [];

        return $returnData;
    }

    /**
     * Adds the cookies of the current page to the cookie jar, if there is one.
     *
     * @throws CommunicationException
     * @throws OperationTimedOut
     * @throws NoResponseAvailable
     * @throws InvalidCookieException
     */
    protected function addCookiesToJar(?CookieJar $cookieJar, UriInterface $requestUrl): void
    {
        if (!$cookieJar) {
            return;
        }

        $cookies = $this->page?->getCookies();

        if ($cookies) {
            $cookieJar->addFrom($requestUrl, $cookies);
        }
    }

    /**
     * Returns the open browser instance, or creates a new one when there is none yet or
     * when options or proxy changed. An existing browser is closed before creating a new one.
     *
     * @throws Exception
     */
    protected function getBrowser(
        RequestInterface $request,
        ?string $proxy = null,
    ): Browser {
        if (!$this->browser || $this->shouldRenewBrowser($proxy)) {
            $this->closeBrowser();

            $options = $this->optionsFromRequest($request, $proxy);

            if (!$this->browserFactory) {
                $this->browserFactory = new BrowserFactory($this->executable);
            }

            $this->browser = $this->browserFactory->createBrowser($options);

            if ($this->pageInitScript) {
                $this->browser->setPagePreScript($this->pageInitScript);
            }

            $this->optionsDirty = false;
        }

        return $this->browser;
    }

    protected function shouldRenewBrowser(?string $proxy): bool
    {
        return $this->optionsDirty || ($proxy !== $this->proxy);
    }

    /**
     * Builds the browser options from the configured options, the request's headers and
     * the proxy. Also remembers the proxy that the options were built with.
     *
     * @param RequestInterface $request
     * @return array<string, mixed>
     */
    protected function optionsFromRequest(RequestInterface $request, ?string $proxy = null): array
    {
        $options = $this->options;

        if (isset($request->getHeader('User-Agent')[0]) && !$this->useNativeUserAgent) {
            $options['userAgent'] = $request->getHeader('User-Agent')[0];
        } elseif ($this->useNativeUserAgent && !empty($request->getHeader('User-Agent'))) {
            // Don't send the request's user agent as a header either.
            $request = $request->withoutHeader('User-Agent');
        }

        $options['headers'] = array_merge(
            $options['headers'] ?? [],
            $this->prepareRequestHeaders($request->getHeaders()),
        );

        if (!empty($proxy)) {
            $this->proxy = $options['proxyServer'] = $proxy;
        } else {
            $this->proxy = null;
        }

        return $options;
    }

    /**
     * Removes headers causing problems with the headless browser and joins array header
     * values to a single string.
     *
     * @param mixed[] $headers
     * @return array<string, string>
     */
    protected function prepareRequestHeaders(array $headers = []): array
    {
        $headers = $this->removeHeadersCausingErrorWithHeadlessBrowser($headers);

        return array_map(function ($headerValue) {
            return is_array($headerValue) ? implode(';', $headerValue) : $headerValue;
        }, $headers);
    }

    /**
     * @param mixed[] $headers
     * @return mixed[]
     */
    protected function removeHeadersCausingErrorWithHeadlessBrowser(array $headers = []): array
    {
        $removeHeaders = ['host'];

        foreach ($headers as $headerName => $headerValue) {
            if (in_array(strtolower($headerName), $removeHeaders, true)) {
                unset($headers[$headerName]);
            }
        }

        return $headers;
    }

    /**
     * Asks the page (via JavaScript) whether the loaded document is an HTML document.
     *
     * Returns false when there is no page, and true when the check itself fails.
     */
    protected function responseIsHtmlDocument(?Page $page = null): bool
    {
        if (!$page) {
            return false;
        }

        try {
            return $page->evaluate(
                <<<JS
                (document.contentType === 'text/html' || document instanceof HTMLDocument) &&
                !(document.contentType === 'text/plain' && document.body.textContent.trimLeft().startsWith('<?xml '))
                JS,
            )->getReturnValue(3000);
        } catch (Throwable) {
            return true;
        }
    }

    /**
     * In production, retrieving the raw response body using the Network.getResponseBody message sometimes failed.
     * Waiting briefly before sending the message appeared to resolve the issue.
     * So, this method tries up to three times with a brief wait between each attempt.
     *
     * There is no wait after the last attempt, as nothing follows that could benefit from it.
     */
    protected function tryToGetRawResponseBody(Page $page, string $requestId): ?string
    {
        $maxAttempts = 3;

        for ($attempt = 1; $attempt <= $maxAttempts; $attempt++) {
            try {
                $message = $page->getSession()->sendMessageSync(new Message('Network.getResponseBody', [
                    'requestId' => $requestId,
                ]));

                if ($message->isSuccessful()) {
                    $body = $message->getData()['result']['body'] ?? null;

                    if (is_string($body) && $body !== '') {
                        return $body;
                    }
                }
            } catch (Throwable) {
                // Deliberately ignored: a failed attempt is simply retried, and the caller
                // falls back to the HTML of the page when null is returned.
            }

            if ($attempt < $maxAttempts) {
                usleep($attempt * 100000);
            }
        }

        return null;
    }

    /**
     * Returns the HTML of the current page, or an empty string when there is no page.
     *
     * When shadow elements should be included, the HTML is assembled by a script executed
     * in the page. If that script fails, the regular page HTML is returned instead.
     *
     * @throws CommunicationException
     * @throws JavascriptException
     */
    protected function getHtmlFromPage(): string
    {
        if ($this->page instanceof Page && $this->includeShadowElements) {
            try {
                // Found this script on
                // https://stackoverflow.com/questions/69867758/how-can-i-get-all-the-html-in-a-document-or-node-containing-shadowroot-elements
                return $this->page->evaluate(<<<JS
                    function extractHTML(node) {
                        if (!node) return ''
                        if (node.nodeType===3) return node.textContent;
                        if (node.nodeType!==1) return ''

                        let html = ''
                        let outer = node.cloneNode();
                        node = node.shadowRoot || node

                        if (node.children.length) {
                            for (let n of node.childNodes) {
                                if (n.assignedNodes) {
                                    if (n.assignedNodes()[0]) {
                                        html += extractHTML(n.assignedNodes()[0])
                                    } else { html += n.innerHTML }
                                } else { html += extractHTML(n) }
                            }
                        } else { html = node.innerHTML }

                        outer.innerHTML = html

                        return outer.outerHTML
                    }

                    extractHTML(document.documentElement);
                    JS)->getReturnValue();
            } catch (Throwable) {
                return $this->page->getHtml();
            }
        }

        return $this->page?->getHtml() ?? '';
    }
}


================================================
FILE: src/Loader/Http/HttpLoader.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http;

use Crwlr\Crawler\Loader\Http\Cache\RetryManager;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\Http\Politeness\RetryErrorResponseHandler;
use Crwlr\Crawler\Loader\Http\Politeness\RobotsTxtHandler;
use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
use Crwlr\Crawler\Loader\Loader;
use Crwlr\Crawler\Steps\Filters\FilterInterface;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Crwlr\Crawler\Utils\RequestKey;
use Crwlr\Url\Exceptions\InvalidUrlException;
use Crwlr\Url\Url;
use Error;
use Exception;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use GuzzleHttp\Psr7\Request;
use HeadlessChromium\Exception\CommunicationException;
use HeadlessChromium\Exception\CommunicationException\CannotReadResponse;
use HeadlessChromium\Exception\CommunicationException\InvalidResponse;
use HeadlessChromium\Exception\CommunicationException\ResponseHasError;
use HeadlessChromium\Exception\JavascriptException;
use HeadlessChromium\Exception\NavigationExpired;
use HeadlessChromium\Exception\NoResponseAvailable;
use HeadlessChromium\Exception\OperationTimedOut;
use InvalidArgumentException;
use Psr\Http\Client\ClientExceptionInterface;
use Psr\Http\Client\ClientInterface;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;
use Throwable;

class HttpLoader extends Loader
{
    protected ClientInterface $httpClient;

    protected CookieJar $cookieJar;

    protected bool $useCookies = true;

    protected ?HeadlessBrowserLoaderHelper $browserHelper = null;

    protected bool $useHeadlessBrowser = false;

    protected ?RobotsTxtHandler $robotsTxtHandler = null;

    protected Throttler $throttler;

    /**
     * @var mixed[]
     */
    protected array $defaultGuzzleClientConfig = [
        'connect_timeout' => 10,
        'timeout' => 60,
    ];

    protected int $maxRedirects = 10;

    protected ?RetryManager $retryCachedErrorResponses = null;

    protected bool $writeOnlyCache = false;

    /**
     * @var array<int, FilterInterface>
     */
    protected array $cacheUrlFilters = [];

    protected bool $skipCacheForNextRequest = false;

    protected ?ProxyManager $proxies = null;

    /**
     * Sets up the HTTP client, the default success and error logging hooks, the cookie jar
     * and the throttler.
     *
     * When no HTTP client is passed, a Guzzle Client is created with the given config,
     * passed through mergeClientConfigWithDefaults(). When no throttler is passed, a new
     * Throttler instance is created.
     *
     * @param mixed[] $defaultGuzzleClientConfig
     */
    public function __construct(
        UserAgentInterface $userAgent,
        ?ClientInterface $httpClient = null,
        ?LoggerInterface $logger = null,
        ?Throttler $throttler = null,
        protected RetryErrorResponseHandler $retryErrorResponseHandler = new RetryErrorResponseHandler(),
        array $defaultGuzzleClientConfig = [],
    ) {
        parent::__construct($userAgent, $logger);

        // Uses $this->logger rather than the $logger argument, which may be null.
        $this->retryErrorResponseHandler->setLogger($this->logger);

        $this->httpClient = $httpClient ?? new Client($this->mergeClientConfigWithDefaults($defaultGuzzleClientConfig));

        // Default hook: log every successfully loaded URL.
        $this->onSuccess(function (RequestInterface $request, ResponseInterface $response, LoggerInterface $logger) {
            $logger->info('Loaded ' . $request->getUri()->__toString());
        });

        // Default hook: log failures, either with the status of the error response
        // or with the message of the exception/error that was thrown.
        $this->onError(function (RequestInterface $request, Exception|Error|ResponseInterface $exceptionOrResponse, $logger) {
            $logMessage = 'Failed to load ' . $request->getUri()->__toString() . ': ';

            if ($exceptionOrResponse instanceof ResponseInterface) {
                $logMessage .= 'got response ' . $exceptionOrResponse->getStatusCode() . ' - ' .
                    $exceptionOrResponse->getReasonPhrase();
            } else {
                $logMessage .= $exceptionOrResponse->getMessage();
            }

            $logger->error($logMessage);
        });

        $this->cookieJar = new CookieJar();

        $this->throttler = $throttler ?? new Throttler();
    }

    /**
     * Loads the given subject and returns the responded request, or null on failure.
     *
     * This method does not throw. An invalid subject is logged as an error. When loading
     * is not allowed, null is returned. Any exception thrown while loading is passed to the
     * onError hook. Error responses (status code >= 400) are also passed to the onError hook,
     * but are still returned. The afterLoad hook is called whenever the subject was valid.
     *
     * @param mixed $subject
     * @return RespondedRequest|null
     */
    public function load(mixed $subject): ?RespondedRequest
    {
        $this->_resetCalledHooks();

        try {
            $request = $this->validateSubjectType($subject);
        } catch (InvalidArgumentException|Exception $exception) {
            // The subject can be anything here, so it must not be cast to string blindly:
            // casting a plain object throws an Error, which would escape this method.
            if ($subject instanceof RequestInterface) {
                $url = (string) $subject->getUri();
            } elseif ($subject === null || is_scalar($subject) || $subject instanceof \Stringable) {
                $url = (string) $subject;
            } else {
                $url = '[' . get_debug_type($subject) . ']';
            }

            $this->logger->error('Invalid input URL: ' . $url . ' - ' . $exception->getMessage());

            return null;
        }

        try {
            if (!$this->isAllowedToBeLoaded($request->getUri())) {
                return null;
            }

            $isFromCache = false;

            $respondedRequest = $this->tryLoading($request, $isFromCache);

            if ($respondedRequest->response->getStatusCode() < 400) {
                $this->callHook('onSuccess', $request, $respondedRequest->response);
            } else {
                $this->callHook('onError', $request, $respondedRequest->response);
            }

            // Responses that came from the cache don't need to be written to it again.
            if (!$isFromCache) {
                $this->addToCache($respondedRequest);
            }

            return $respondedRequest;
        } catch (Throwable $exception) {
            // Don't move to finally so hooks don't run before it.
            $this->throttler->trackRequestEndFor($request->getUri());

            $this->callHook('onError', $request, $exception);

            return null;
        } finally {
            $this->callHook('afterLoad', $request);

            $this->_resetCalledHooks();
        }
    }

    /**
     * Loads the given subject and throws when anything goes wrong.
     *
     * An invalid subject makes validateSubjectType() throw before loading starts. Everything
     * thrown afterward, including the exception for an error response (status code >= 400),
     * is wrapped in a LoadingException.
     *
     * @throws LoadingException|InvalidArgumentException|Exception
     */
    public function loadOrFail(mixed $subject): RespondedRequest
    {
        $this->_resetCalledHooks();

        $request = $this->validateSubjectType($subject);

        try {
            $this->isAllowedToBeLoaded($request->getUri(), true);

            $servedFromCache = false;

            $respondedRequest = $this->tryLoading($request, $servedFromCache);

            $statusCode = $respondedRequest->response->getStatusCode();

            if ($statusCode >= 400) {
                throw LoadingException::make($request->getUri(), $statusCode);
            }

            foreach (['onSuccess', 'afterLoad'] as $hookName) {
                if ($hookName === 'onSuccess') {
                    $this->callHook($hookName, $request, $respondedRequest->response);
                } else {
                    $this->callHook($hookName, $request);
                }
            }

            if ($servedFromCache === false) {
                $this->addToCache($respondedRequest);
            }

            return $respondedRequest;
        } catch (Throwable $thrown) {
            $this->_resetCalledHooks();

            throw LoadingException::from($thrown);
        }
    }

    /**
     * Sets the useCookies flag of this loader to false.
     *
     * @return $this
     */
    public function dontUseCookies(): static
    {
        $this->useCookies = false;

        return $this;
    }

    /**
     * Removes all cookies from the loader's cookie jar.
     */
    public function flushCookies(): void
    {
        $this->cookieJar->flush();
    }

    /**
     * Sets the useHeadlessBrowser flag of this loader to true.
     *
     * @return $this
     */
    public function useHeadlessBrowser(): static
    {
        $this->useHeadlessBrowser = true;

        return $this;
    }

    /**
     * Sets the useHeadlessBrowser flag back to false and closes the headless browser.
     *
     * @return $this
     * @throws Exception
     */
    public function useHttpClient(): static
    {
        $this->useHeadlessBrowser = false;

        // browser() creates the helper if it doesn't exist yet; closeBrowser() is then a no-op.
        $this->browser()->closeBrowser();

        return $this;
    }

    /**
     * Returns the current value of the useHeadlessBrowser flag.
     */
    public function usesHeadlessBrowser(): bool
    {
        return $this->useHeadlessBrowser;
    }

    /**
     * Stores the given value as the loader's maximum number of redirects (default is 10).
     *
     * @return $this
     */
    public function setMaxRedirects(int $maxRedirects): static
    {
        $this->maxRedirects = $maxRedirects;

        return $this;
    }

    /**
     * Returns the loader's robots.txt handler. The instance is created on the first call,
     * with this loader and its logger, and re-used afterward.
     */
    public function robotsTxt(): RobotsTxtHandler
    {
        $this->robotsTxtHandler ??= new RobotsTxtHandler($this, $this->logger);

        return $this->robotsTxtHandler;
    }

    /**
     * Returns the loader's Throttler instance.
     */
    public function throttle(): Throttler
    {
        return $this->throttler;
    }

    /**
     * Creates a new RetryManager, stores it on the loader and returns it.
     *
     * Every call creates a fresh instance, replacing one that was created earlier.
     */
    public function retryCachedErrorResponses(): RetryManager
    {
        $retryManager = new RetryManager();

        $this->retryCachedErrorResponses = $retryManager;

        return $retryManager;
    }

    /**
     * Sets the writeOnlyCache flag of this loader to true.
     *
     * @return $this
     */
    public function writeOnlyCache(): static
    {
        $this->writeOnlyCache = true;

        return $this;
    }

    /**
     * Appends the given filter to the loader's list of cache URL filters.
     *
     * @return $this
     */
    public function cacheOnlyWhereUrl(FilterInterface $filter): static
    {
        $this->cacheUrlFilters[] = $filter;

        return $this;
    }

    /**
     * Makes the loader use a single proxy.
     *
     * After checkIfProxiesCanBeUsed() has passed, the loader's proxy manager is replaced
     * with a new one containing only the given proxy URL.
     *
     * @throws Exception
     */
    public function useProxy(string $proxyUrl): void
    {
        $this->checkIfProxiesCanBeUsed();

        $this->proxies = new ProxyManager([$proxyUrl]);
    }

    /**
     * Makes the loader use a list of proxies.
     *
     * After checkIfProxiesCanBeUsed() has passed, the loader's proxy manager is replaced
     * with a new one containing the given proxy URLs.
     *
     * @param string[] $proxyUrls
     * @throws Exception
     */
    public function useRotatingProxies(array $proxyUrls): void
    {
        $this->checkIfProxiesCanBeUsed();

        $this->proxies = new ProxyManager($proxyUrls);
    }

    /**
     * Returns the headless browser helper. The instance is created on the first call,
     * with the loader's logger, and re-used afterward.
     */
    public function browser(): HeadlessBrowserLoaderHelper
    {
        $this->browserHelper ??= new HeadlessBrowserLoaderHelper(logger: $this->logger);

        return $this->browserHelper;
    }

    /**
     * Writes the responded request to the cache, under its cache key.
     *
     * Nothing happens when the loader has no cache, or when shouldResponseBeCached() says
     * that this response should not be cached.
     *
     * @throws \Psr\SimpleCache\InvalidArgumentException
     */
    public function addToCache(RespondedRequest $respondedRequest): void
    {
        if (!$this->cache) {
            return;
        }

        if (!$this->shouldResponseBeCached($respondedRequest)) {
            return;
        }

        $this->cache->set($respondedRequest->cacheKey(), $respondedRequest);
    }

    /**
     * Makes the loader bypass reading from the cache for the next request only; tryLoading() resets the
     * flag again after it handled a request.
     *
     * @return $this
     */
    public function skipCacheForNextRequest(): static
    {
        $this->skipCacheForNextRequest = true;

        return $this;
    }

    /**
     * Prepares the request, then either serves it from the cache or actually loads it.
     *
     * @param bool $isFromCache Set to true (by reference) when the response was taken from the cache.
     * @throws LoadingException|Throwable|\Psr\SimpleCache\InvalidArgumentException
     */
    protected function tryLoading(
        RequestInterface $request,
        bool &$isFromCache,
    ): RespondedRequest {
        $request = $this->prepareRequest($request);

        $this->callHook('beforeLoad', $request);

        $cached = null;

        if ($this->shouldRequestBeServedFromCache($request)) {
            $cached = $this->getFromCache($request);
        }

        if ($cached) {
            $isFromCache = true;

            $cached->setIsServedFromCache();

            $this->callHook('onCacheHit', $request, $cached->response);
        }

        // Skipping the cache is a one-shot setting, so it is reset once the cache lookup is through.
        $this->skipCacheForNextRequest = false;

        return $cached ?: $this->waitForGoAndLoad($request);
    }

    /**
     * Waits until the throttler gives its go for the request's URI, loads the request and, when the
     * retry handler asks to wait because of the response, hands over to its retry handling.
     *
     * @throws ClientExceptionInterface
     * @throws GuzzleException
     * @throws LoadingException
     * @throws CommunicationException
     * @throws CannotReadResponse
     * @throws InvalidResponse
     * @throws ResponseHasError
     * @throws JavascriptException
     * @throws NavigationExpired
     * @throws NoResponseAvailable
     * @throws OperationTimedOut
     * @throws Exception
     */
    protected function waitForGoAndLoad(RequestInterface $request): RespondedRequest
    {
        $this->throttler->waitForGo($request->getUri());

        $loaded = $this->loadViaClientOrHeadlessBrowser($request);

        if (!$this->retryErrorResponseHandler->shouldWait($loaded)) {
            return $loaded;
        }

        // The request is prepared again for every retry attempt before it is re-sent.
        return $this->retryErrorResponseHandler->handleRetries(
            $loaded,
            fn() => $this->loadViaClientOrHeadlessBrowser($this->prepareRequest($request)),
        );
    }

    /**
     * Loads the request with the headless browser when the loader is set to use it, otherwise with the
     * HTTP client (including redirect handling).
     *
     * @throws ClientExceptionInterface
     * @throws GuzzleException
     * @throws LoadingException
     * @throws CommunicationException
     * @throws CannotReadResponse
     * @throws InvalidResponse
     * @throws ResponseHasError
     * @throws JavascriptException
     * @throws NavigationExpired
     * @throws NoResponseAvailable
     * @throws OperationTimedOut
     */
    protected function loadViaClientOrHeadlessBrowser(RequestInterface $request): RespondedRequest
    {
        if (!$this->useHeadlessBrowser) {
            return $this->handleRedirects($request);
        }

        // Null when no proxies are configured.
        $proxy = $this->proxies?->getProxy();

        $cookieJar = $this->useCookies ? $this->cookieJar : null;

        return $this->browser()->navigateToPageAndGetRespondedRequest($request, $this->throttler, $proxy, $cookieJar);
    }

    /**
     * Sends the request with the HTTP client and follows redirects by calling itself with the redirect
     * target, until a non-redirect response arrives or the redirect limit is reached.
     *
     * @param ?RespondedRequest $respondedRequest The object collecting the redirect chain; null on the first call.
     * @param int $redirectNumber How many redirects have already been followed.
     * @throws ClientExceptionInterface
     * @throws LoadingException
     * @throws GuzzleException
     * @throws Exception
     */
    protected function handleRedirects(
        RequestInterface $request,
        ?RespondedRequest $respondedRequest = null,
        int $redirectNumber = 0,
    ): RespondedRequest {
        if ($redirectNumber >= $this->maxRedirects) {
            throw new LoadingException('Too many redirects.');
        }

        // Throttler tracking starts only with the initial request, not with each followed redirect.
        if (!$respondedRequest) {
            $this->throttler->trackRequestStartFor($request->getUri());
        }

        $response = $this->proxies && $this->httpClient instanceof Client
            ? $this->sendProxiedRequestUsingGuzzle($request, $this->httpClient)
            : $this->httpClient->sendRequest($request);

        if ($respondedRequest) {
            $respondedRequest->setResponse($response);
        } else {
            $respondedRequest = new RespondedRequest($request, $response);
        }

        $this->addCookiesToJar($respondedRequest);

        if (!$respondedRequest->isRedirect()) {
            $this->throttler->trackRequestEndFor($respondedRequest->request->getUri());

            return $respondedRequest;
        }

        $redirectTarget = $respondedRequest->effectiveUri();

        $this->logger()->info('Load redirect to: ' . $redirectTarget);

        return $this->handleRedirects(
            $request->withUri(Url::parsePsr7($redirectTarget)),
            $respondedRequest,
            $redirectNumber + 1,
        );
    }

    /**
     * Sends the PSR-7 request via guzzle's request() method, because that allows to pass the proxy
     * as a request option.
     *
     * @throws GuzzleException
     */
    protected function sendProxiedRequestUsingGuzzle(RequestInterface $request, Client $client): ResponseInterface
    {
        $options = [
            'headers' => $request->getHeaders(),
            'proxy' => $this->proxies?->getProxy(),
            'version' => $request->getProtocolVersion(),
            'body' => $request->getBody(),
        ];

        return $client->request($request->getMethod(), $request->getUri(), $options);
    }

    /**
     * Makes sure that proxies are supported with the current setup: either the headless browser is used,
     * or the HTTP client is a guzzle Client.
     *
     * @throws Exception
     */
    protected function checkIfProxiesCanBeUsed(): void
    {
        if ($this->usesHeadlessBrowser() || $this->httpClient instanceof Client) {
            return;
        }

        throw new Exception(
            'The included proxy feature can only be used when using a guzzle HTTP client or headless chrome ' .
            'browser for loading.',
        );
    }

    /**
     * Lays the given config over the default guzzle client config. Given values win for keys that exist
     * in both arrays, the merge is not recursive.
     *
     * @param mixed[] $config
     * @return mixed[]
     */
    protected function mergeClientConfigWithDefaults(array $config): array
    {
        return array_replace($this->defaultGuzzleClientConfig, $config);
    }

    /**
     * Checks the robots.txt rules for the URI. A disallowed URI is always logged as a warning and,
     * depending on $throwsException, either reported as false or by throwing.
     *
     * @throws LoadingException
     * @throws Exception
     */
    protected function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException = false): bool
    {
        if ($this->robotsTxt()->isAllowed($uri)) {
            return true;
        }

        $message = 'Crawler is not allowed to load ' . $uri . ' according to robots.txt file.';

        $this->logger->warning($message);

        if ($throwsException) {
            throw new LoadingException($message);
        }

        return false;
    }

    /**
     * Looks up the request in the cache. Returns null when there is no readable cache, no entry for the
     * request, or when the cached response is an error response that shall be retried.
     *
     * @throws \Psr\SimpleCache\InvalidArgumentException
     * @throws Exception
     */
    protected function getFromCache(RequestInterface $request): ?RespondedRequest
    {
        if (!$this->cache || $this->writeOnlyCache) {
            return null;
        }

        $cacheKey = RequestKey::from($request);

        if (!$this->cache->has($cacheKey)) {
            return null;
        }

        $this->logger->info('Found ' . (string) $request->getUri() . ' in cache.');

        $cached = $this->cache->get($cacheKey);

        // Up to v0.7 plain serialized arrays were cached. Still convert those for backwards compatibility.
        if (is_array($cached)) {
            $cached = RespondedRequest::fromArray($cached);
        }

        if ($this->retryCachedErrorResponses?->shallBeRetried($cached->response->getStatusCode())) {
            $this->logger->info('Cached response was an error response, retry.');

            return null;
        }

        return $cached;
    }

    /**
     * Decides whether a response may be written to the cache.
     *
     * Every registered cache URL filter has to match at least one of the URIs involved in the request
     * (the requested URI or one of the redirect URIs). Without filters, every response may be cached.
     */
    protected function shouldResponseBeCached(RespondedRequest $respondedRequest): bool
    {
        if (empty($this->cacheUrlFilters)) {
            return true;
        }

        // The list of URIs is the same for all filters, so get it only once.
        $uris = $respondedRequest->allUris();

        foreach ($this->cacheUrlFilters as $filter) {
            $matchesAnyUri = false;

            foreach ($uris as $uri) {
                if ($filter->evaluate($uri)) {
                    $matchesAnyUri = true;

                    // One matching URI is enough for this filter, no need to evaluate the remaining ones.
                    break;
                }
            }

            if (!$matchesAnyUri) {
                return false;
            }
        }

        return true;
    }

    /**
     * Decides whether the cache shall be asked for the request: not when the cache is skipped for this
     * request, and not when one of the cache URL filters rejects the request URI.
     */
    protected function shouldRequestBeServedFromCache(RequestInterface $request): bool
    {
        if ($this->skipCacheForNextRequest === true) {
            return false;
        }

        $uri = (string) $request->getUri();

        foreach ($this->cacheUrlFilters as $filter) {
            if (!$filter->evaluate($uri)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Turns the given subject into a loadable PSR-7 request.
     *
     * A string is parsed as URL and wrapped in a GET request, a request object is returned as is.
     * In both cases relative references are rejected.
     *
     * @throws InvalidArgumentException|Exception
     */
    protected function validateSubjectType(RequestInterface|string $requestOrUri): RequestInterface
    {
        if (is_string($requestOrUri)) {
            try {
                $url = Url::parse($requestOrUri);

                if ($url->isRelativeReference()) {
                    throw new InvalidArgumentException(
                        'The URI is a relative reference and therefore can\'t be loaded.',
                    );
                }

                return new Request('GET', $url->toPsr7());
            } catch (InvalidUrlException $exception) {
                // Keep the original exception as previous one, so the actual parsing problem isn't lost.
                throw new InvalidArgumentException('Invalid URL.', 0, $exception);
            }
        } elseif (
            empty(trim($requestOrUri->getUri()->getScheme())) &&
            Url::parse($requestOrUri->getUri())->isRelativeReference()
        ) {
            throw new InvalidArgumentException('The URI is a relative reference and therefore can\'t be loaded.');
        }

        return $requestOrUri;
    }

    /**
     * Gets a request ready to be sent: sets the loader's user agent, flattens multi-value headers
     * and adds cookies.
     *
     * @throws Exception
     */
    protected function prepareRequest(RequestInterface $request): RequestInterface
    {
        $request = $request->withHeader('User-Agent', (string) $this->userAgent);

        // Seen while writing tests: guzzle sent only the last element of headers having multiple string values
        // in the PSR-7 request, instead of joining them to a comma separated string. It is unclear if that
        // happens with all handlers (curl, stream). Until that is sorted out, the header lines are built here.
        foreach (array_keys($request->getHeaders()) as $headerName) {
            $request = $request->withHeader($headerName, $request->getHeaderLine($headerName));
        }

        return $this->addCookiesToRequest($request);
    }

    /**
     * Adds the cookies of the response to the cookie jar, if cookies are used. Problems are only logged
     * as a warning, so they don't interrupt loading.
     */
    protected function addCookiesToJar(RespondedRequest $respondedRequest): void
    {
        if (!$this->useCookies) {
            return;
        }

        try {
            $this->cookieJar->addFrom($respondedRequest->effectiveUri(), $respondedRequest->response);
        } catch (Exception $exception) {
            $this->logger->warning('Problem when adding cookies to the Jar: ' . $exception->getMessage());
        }
    }

    /**
     * Adds one Cookie header value for each cookie that the jar returns for the request URI.
     * The request is returned unchanged when cookies aren't used.
     *
     * @throws Exception
     */
    protected function addCookiesToRequest(RequestInterface $request): RequestInterface
    {
        if ($this->useCookies) {
            foreach ($this->cookieJar->getFor($request->getUri()) as $cookie) {
                $request = $request->withAddedHeader('Cookie', (string) $cookie);
            }
        }

        return $request;
    }
}


================================================
FILE: src/Loader/Http/Messages/RespondedRequest.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Messages;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Browser\Screenshot;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Utils\RequestKey;
use Crwlr\Url\Url;
use Exception;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\ResponseInterface;

/**
 * Couples an HTTP request with the response it received and keeps track of the redirect URIs
 * that occurred until that response.
 */
class RespondedRequest
{
    /**
     * The URIs the request was redirected to, in the order the redirects were added.
     *
     * @var string[]
     */
    protected array $redirects = [];

    protected bool $isServedFromCache = false;

    /**
     * @param Screenshot[] $screenshots
     * @throws Exception
     */
    public function __construct(
        public RequestInterface $request,
        public ResponseInterface $response,
        public array $screenshots = [],
    ) {
        // Going through setResponse() registers the redirect URI right away, if the response is a redirect.
        $this->setResponse($this->response);
    }

    /**
     * Creates an instance from the array format that __serialize() produces.
     *
     * @param mixed[] $data
     * @return RespondedRequest
     * @throws Exception
     */
    public static function fromArray(array $data): RespondedRequest
    {
        $respondedRequest = new RespondedRequest(
            self::requestFromArray($data),
            self::responseFromArray($data),
            self::screenshotsFromArray($data),
        );

        $respondedRequest->restoreEffectiveUriFromArray($data);

        return $respondedRequest;
    }

    /**
     * @return mixed[]
     * @throws MissingZlibExtensionException
     */
    public function __serialize(): array
    {
        return [
            'requestMethod' => $this->request->getMethod(),
            'requestUri' => $this->request->getUri()->__toString(),
            'requestHeaders' => $this->request->getHeaders(),
            'requestBody' => Http::getBodyString($this->request),
            'effectiveUri' => $this->effectiveUri(),
            'responseStatusCode' => $this->response->getStatusCode(),
            'responseHeaders' => $this->response->getHeaders(),
            'responseBody' => Http::getBodyString($this->response),
            'screenshots' => array_map(fn(Screenshot $screenshot) => $screenshot->path, $this->screenshots),
        ];
    }

    /**
     * Returns the serialized data, extended with shorter alias keys (url, uri, status, headers, body).
     *
     * @return mixed[]
     * @throws MissingZlibExtensionException
     */
    public function toArrayForResult(): array
    {
        $serialized = $this->__serialize();

        $mapping = [
            'url' => 'effectiveUri',
            'uri' => 'effectiveUri',
            'status' => 'responseStatusCode',
            'headers' => 'responseHeaders',
            'body' => 'responseBody',
        ];

        foreach ($mapping as $newKey => $originalKey) {
            $serialized[$newKey] = $serialized[$originalKey];
        }

        return $serialized;
    }

    /**
     * @param mixed[] $data
     * @throws Exception
     */
    public function __unserialize(array $data): void
    {
        $this->request = self::requestFromArray($data);

        $this->response = self::responseFromArray($data);

        $this->restoreEffectiveUriFromArray($data);

        $this->screenshots = self::screenshotsFromArray($data);
    }

    /**
     * The URI of the last redirect, or the requested URI if there was no redirect.
     */
    public function effectiveUri(): string
    {
        return empty($this->redirects) ? $this->requestedUri() : end($this->redirects);
    }

    public function requestedUri(): string
    {
        // Cast explicitly: getUri() returns a UriInterface object and relying on implicit coercion to the
        // string return type fails with a TypeError as soon as strict types are enabled.
        return (string) $this->request->getUri();
    }

    /**
     * The requested URI followed by all redirect URIs, without duplicates.
     *
     * @return array<int, string>
     */
    public function allUris(): array
    {
        // Using the URIs as array keys removes duplicates while keeping the order of first occurrence.
        $uris = [$this->requestedUri() => $this->requestedUri()];

        foreach ($this->redirects as $redirect) {
            $uris[$redirect] = $redirect;
        }

        return array_values($uris);
    }

    /**
     * True for every response status code from 300 to 399.
     */
    public function isRedirect(): bool
    {
        $statusCode = $this->response->getStatusCode();

        return $statusCode >= 300 && $statusCode < 400;
    }

    /**
     * @return string[]
     */
    public function redirects(): array
    {
        return $this->redirects;
    }

    /**
     * Replaces the response. If the new response is a redirect, the URI from its Location header
     * is added to the redirects.
     *
     * @throws Exception
     */
    public function setResponse(ResponseInterface $response): void
    {
        $this->response = $response;

        if ($this->isRedirect()) {
            $this->addRedirectUri();
        }
    }

    /**
     * Adds a redirect URI, resolved against the current effective URI. Without an argument, the Location
     * header of the current response is used.
     *
     * @throws Exception
     */
    public function addRedirectUri(?string $redirectUri = null): void
    {
        $redirectUri = Url::parse($this->effectiveUri())
            ->resolve($redirectUri ?? $this->response->getHeaderLine('Location'))
            ->__toString();

        // Add it only if different from the previous one.
        if ($redirectUri !== end($this->redirects)) {
            $this->redirects[] = $redirectUri;
        }
    }

    public function cacheKey(): string
    {
        return RequestKey::from($this->request);
    }

    public function isServedFromCache(): bool
    {
        return $this->isServedFromCache;
    }

    public function setIsServedFromCache(bool $value = true): void
    {
        $this->isServedFromCache = $value;
    }

    /**
     * Shared by fromArray() and __unserialize(): registers the stored effective URI as redirect,
     * when it differs from the requested URI.
     *
     * @param mixed[] $data
     * @throws Exception
     */
    protected function restoreEffectiveUriFromArray(array $data): void
    {
        // Data without an effectiveUri key is treated like data with an empty one.
        $effectiveUri = $data['effectiveUri'] ?? null;

        if ($effectiveUri && $effectiveUri !== $data['requestUri']) {
            $this->addRedirectUri($effectiveUri);
        }
    }

    /**
     * @param mixed[] $data
     */
    protected static function requestFromArray(array $data): Request
    {
        return new Request(
            $data['requestMethod'],
            $data['requestUri'],
            $data['requestHeaders'],
            $data['requestBody'],
        );
    }

    /**
     * @param mixed[] $data
     */
    protected static function responseFromArray(array $data): Response
    {
        return new Response(
            $data['responseStatusCode'],
            $data['responseHeaders'],
            $data['responseBody'],
        );
    }

    /**
     * Restores screenshot objects from stored file paths. Paths of files that don't exist (anymore)
     * are left out.
     *
     * @param mixed[] $data
     * @return Screenshot[]
     */
    protected static function screenshotsFromArray(array $data): array
    {
        $screenshots = [];

        // A missing key and a null value are both handled like an empty list.
        foreach ($data['screenshots'] ?? [] as $screenshot) {
            if (file_exists($screenshot)) {
                $screenshots[] = new Screenshot($screenshot);
            }
        }

        return $screenshots;
    }
}


================================================
FILE: src/Loader/Http/Politeness/RetryErrorResponseHandler.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Politeness;

use Closure;
use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Psr\Http\Message\ResponseInterface;
use Psr\Log\LoggerInterface;

/**
 * Handles error responses that ask the client to back off (see $waitErrors): waits and retries
 * the request a limited number of times.
 */
class RetryErrorResponseHandler
{
    protected ?LoggerInterface $logger = null;

    /**
     * Response status codes that trigger waiting and retrying, with their reason phrases for log messages.
     *
     * @var array<int, string>
     */
    protected array $waitErrors = [
        429 => 'Too many Requests',
        503 => 'Service Unavailable',
    ];

    /**
     * @param int $retries Maximum number of retries after an error response.
     * @param int[] $wait Seconds to wait before each retry (index 0 is the wait time before the first retry).
     * @param int $maxWait Highest Retry-After header value (in seconds) that is accepted.
     */
    public function __construct(
        protected int $retries = 2,
        protected array $wait = [10, 60],
        protected int $maxWait = 60,
    ) {}

    public function shouldWait(RespondedRequest $respondedRequest): bool
    {
        return array_key_exists($respondedRequest->response->getStatusCode(), $this->waitErrors);
    }

    public function setLogger(LoggerInterface $logger): void
    {
        $this->logger = $logger;
    }

    /**
     * Waits and calls the retry callback until it delivers a response that doesn't require waiting.
     *
     * A numeric Retry-After header in the initial error response replaces the wait time before the
     * first retry, for this call only.
     *
     * @param Closure $retryCallback Re-sends the request and returns the new RespondedRequest.
     * @throws LoadingException when all retries failed or the Retry-After value exceeds the max wait time.
     */
    public function handleRetries(
        RespondedRequest $respondedRequest,
        Closure $retryCallback,
    ): RespondedRequest {
        $this->logReceivedErrorResponseMessage($respondedRequest);

        $retryAfter = $this->getWaitTimeFromResponse($respondedRequest->response);

        // The Retry-After value must only affect this call. Remember the configured wait times and put
        // them back in the finally block, so it doesn't leak into the handling of later error responses.
        $configuredWait = $this->wait;

        if ($retryAfter !== null) {
            $this->wait[0] = $retryAfter;
        }

        try {
            for ($retries = 0; $retries < $this->retries; $retries++) {
                $this->logWaitForRetryMessage($retries);

                sleep($this->getWaitTimeForRetry($retries));

                $respondedRequest = $retryCallback();

                if ($respondedRequest instanceof RespondedRequest && !$this->shouldWait($respondedRequest)) {
                    return $respondedRequest;
                } elseif ($respondedRequest) {
                    $this->logRepeatedErrorMessage($respondedRequest);
                }
            }
        } finally {
            $this->wait = $configuredWait;
        }

        $this->logger?->error('Stop crawling');

        throw new LoadingException('Stopped crawling because of repeated error responses.');
    }

    /**
     * Reads the number of seconds from a numeric Retry-After header. Returns null when the header is
     * missing or not numeric.
     *
     * @throws LoadingException when the value is greater than the max wait time.
     */
    protected function getWaitTimeFromResponse(ResponseInterface $response): ?int
    {
        $headerValues = $response->getHeader('Retry-After');

        $retryAfter = reset($headerValues);

        if (!is_numeric($retryAfter)) {
            return null;
        }

        // sleep() throws a ValueError for negative values, so those are treated as "retry immediately".
        $waitFor = max(0, (int) $retryAfter);

        if ($waitFor > $this->maxWait) {
            $this->retryAfterExceedsLimitMessage($response);
        }

        return $waitFor;
    }

    /**
     * Seconds to wait before the retry with the given (zero based) number. When more retries than wait
     * times are configured, the last wait time is reused. Never negative.
     */
    protected function getWaitTimeForRetry(int $retryNumber): int
    {
        if (isset($this->wait[$retryNumber])) {
            return max(0, (int) $this->wait[$retryNumber]);
        }

        if (empty($this->wait)) {
            return 0;
        }

        return max(0, (int) $this->wait[array_key_last($this->wait)]);
    }

    protected function getResponseCodeAndReasonPhrase(RespondedRequest|ResponseInterface $respondedRequest): string
    {
        $response = $respondedRequest instanceof RespondedRequest ? $respondedRequest->response : $respondedRequest;

        $statusCode = $response->getStatusCode();

        if (array_key_exists($statusCode, $this->waitErrors)) {
            return $statusCode . ' (' . $this->waitErrors[$statusCode] . ')';
        }

        return '?';
    }

    protected function logReceivedErrorResponseMessage(RespondedRequest $respondedRequest): void
    {
        $statusCodeAndReasonPhrase = $this->getResponseCodeAndReasonPhrase($respondedRequest);

        $this->logger?->warning(
            'Request to ' . $respondedRequest->requestedUri() . ' returned ' . $statusCodeAndReasonPhrase,
        );
    }

    protected function logWaitForRetryMessage(int $retryNumber): void
    {
        $this->logger?->warning(
            'Will wait for ' . $this->getWaitTimeForRetry($retryNumber) . ' seconds and then retry',
        );
    }

    protected function logRepeatedErrorMessage(RespondedRequest $respondedRequest): void
    {
        $statusCodeAndReasonPhrase = $this->getResponseCodeAndReasonPhrase($respondedRequest);

        $this->logger?->warning('Retry again received an error response: ' . $statusCodeAndReasonPhrase);
    }

    /**
     * Logs an error and always throws. The string return type is never actually used.
     *
     * @throws LoadingException
     */
    protected function retryAfterExceedsLimitMessage(ResponseInterface $response): string
    {
        $statusCodeAndReasonPhrase = $this->getResponseCodeAndReasonPhrase($response);

        $message = 'Retry-After header in ' . $statusCodeAndReasonPhrase . ' response, requires to wait longer ' .
            'than the defined max wait time for this case. If you want to increase this limit, set it ' .
            'in the ErrorResponseHandler of your HttpLoader instance.';

        $this->logger?->error($message);

        throw new LoadingException($message);
    }
}


================================================
FILE: src/Loader/Http/Politeness/RobotsTxtHandler.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Politeness;

use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Loader;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Crwlr\RobotsTxt\Exceptions\InvalidRobotsTxtFileException;
use Crwlr\RobotsTxt\RobotsTxt;
use Crwlr\Url\Url;
use Exception;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;

/**
 * Loads and caches robots.txt files per URL root and answers whether the loader's user agent
 * is allowed to load a URL.
 */
class RobotsTxtHandler
{
    protected UserAgentInterface $userAgent;

    /**
     * Parsed robots.txt files, keyed by the root of the URLs they belong to.
     *
     * @var array<string, RobotsTxt>
     */
    protected array $robotsTxts = [];

    protected bool $ignoreWildcardRules = false;

    public function __construct(
        protected Loader $loader,
        protected ?LoggerInterface $logger = null,
    ) {
        $this->userAgent = $this->loader->userAgent();
    }

    /**
     * After calling this, isAllowed() only denies URLs that are explicitly disallowed for the
     * user agent's product token.
     */
    public function ignoreWildcardRules(): void
    {
        $this->ignoreWildcardRules = true;
    }

    /**
     * Always true when the user agent is not a BotUserAgent, and for the robots.txt file itself.
     * Otherwise the rules of the robots.txt file for the URL's root decide.
     *
     * @throws Exception
     */
    public function isAllowed(string|UriInterface|Url $url): bool
    {
        if (!$this->userAgent instanceof BotUserAgent) {
            return true;
        }

        $url = $this->getUrlInstance($url);

        if ($url->path() === '/robots.txt') {
            return true;
        }

        $robotsTxt = $this->getRobotsTxtFor($url);

        if ($this->ignoreWildcardRules) {
            return !$robotsTxt->isExplicitlyNotAllowedFor($url, $this->userAgent->productToken());
        }

        return $robotsTxt->isAllowed($url, $this->userAgent->productToken());
    }

    /**
     * @return string[]
     * @throws InvalidRobotsTxtFileException
     */
    public function getSitemaps(string|UriInterface|Url $url): array
    {
        return $this->getRobotsTxtFor($url)->sitemaps();
    }

    /**
     * Returns the parsed robots.txt for the root of the given URL. It is loaded only once per root.
     * When parsing fails, a warning is logged and an empty robots.txt is used instead.
     *
     * @throws InvalidRobotsTxtFileException|Exception
     */
    protected function getRobotsTxtFor(string|UriInterface|Url $url): RobotsTxt
    {
        $url = $this->getUrlInstance($url);

        $root = $url->root();

        if (isset($this->robotsTxts[$root])) {
            return $this->robotsTxts[$root];
        }

        $robotsTxtContent = $this->loadRobotsTxtContent($root . '/robots.txt');

        try {
            $this->robotsTxts[$root] = RobotsTxt::parse($robotsTxtContent);
        } catch (Exception $exception) {
            $this->logger?->warning('Failed to parse robots.txt: ' . $exception->getMessage());

            $this->robotsTxts[$root] = RobotsTxt::parse('');
        }

        return $this->robotsTxts[$root];
    }

    /**
     * Loads the robots.txt file and returns its body, or an empty string when loading yields no response.
     */
    protected function loadRobotsTxtContent(string $robotsTxtUrl): string
    {
        $usedHeadlessBrowser = false;

        if ($this->loader instanceof HttpLoader) {
            // If loader is set to use headless browser, temporary switch to using PSR-18 HTTP Client.
            $usedHeadlessBrowser = $this->loader->usesHeadlessBrowser();

            $this->loader->useHttpClient();
        }

        try {
            $response = $this->loader->load($robotsTxtUrl);
        } finally {
            // Switch back even if loading throws. Otherwise the loader would silently keep using the
            // HTTP client for all following requests.
            if ($this->loader instanceof HttpLoader && $usedHeadlessBrowser) {
                $this->loader->useHeadlessBrowser();
            }
        }

        return $response ? Http::getBodyString($response) : '';
    }

    protected function getUrlInstance(string|UriInterface|Url $url): Url
    {
        if (is_string($url) || $url instanceof UriInterface) {
            return Url::parse($url);
        }

        return $url;
    }
}


================================================
FILE: src/Loader/Http/Politeness/Throttler.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Politeness;

use Crwlr\Crawler\Loader\Http\Politeness\TimingUnits\MultipleOf;
use Crwlr\Url\Url;
use Crwlr\Utils\Microseconds;
use Exception;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;

/**
 * Spaces out requests per domain. How long to wait is derived from the duration of the latest tracked
 * response from that domain (or from fixed values), limited by a min and an optional max wait time.
 */
class Throttler
{
    /**
     * Start times of the currently tracked requests, keyed by domain.
     *
     * @var array<string, Microseconds>
     */
    protected array $latestRequestTimes = [];

    /**
     * Times when the latest tracked responses arrived, keyed by domain.
     *
     * @var array<string, Microseconds>
     */
    protected array $latestResponseTimes = [];

    /**
     * Durations between request start and response of the latest tracked requests, keyed by domain.
     *
     * @var array<string, Microseconds>
     */
    protected array $latestDurations = [];

    protected Microseconds|MultipleOf $from;

    protected Microseconds|MultipleOf $to;

    protected Microseconds $min;

    /**
     * URLs of requests that were started and not ended yet.
     *
     * @var string[]
     */
    private array $_currentRequestUrls = [];

    /**
     * @param Microseconds|MultipleOf|null $from Lower bound of the random wait time, defaults to MultipleOf(1.0).
     * @param Microseconds|MultipleOf|null $to Upper bound of the random wait time, defaults to MultipleOf(2.0).
     * @param ?Microseconds $min Minimum wait time, defaults to 0.25 seconds.
     * @param ?Microseconds $max Optional maximum wait time.
     * @throws InvalidArgumentException
     */
    public function __construct(
        Microseconds|MultipleOf|null $from = null,
        Microseconds|MultipleOf|null $to = null,
        ?Microseconds $min = null,
        protected ?Microseconds $max = null,
    ) {
        $this->from = $from ?? new MultipleOf(1.0);

        $this->to = $to ?? new MultipleOf(2.0);

        $this->validateFromAndTo();

        $this->min = $min ?? Microseconds::fromSeconds(0.25);
    }

    /**
     * Sets the range the random wait time is picked from.
     *
     * @throws InvalidArgumentException
     */
    public function waitBetween(Microseconds|MultipleOf $from, Microseconds|MultipleOf $to): static
    {
        $this->from = $from;

        $this->to = $to;

        $this->validateFromAndTo();

        return $this;
    }

    public function waitAtLeast(Microseconds $seconds): static
    {
        $this->min = $seconds;

        return $this;
    }

    public function waitAtMax(Microseconds $seconds): static
    {
        $this->max = $seconds;

        return $this;
    }

    /**
     * Remembers the current time as start time of a request to the URL's domain.
     *
     * @throws Exception
     */
    public function trackRequestStartFor(UriInterface $url): void
    {
        $this->latestRequestTimes[$this->getDomain($url)] = $this->time();

        $this->_internalTrackStartFor($url);
    }

    /**
     * Records response time and duration for the URL's domain. Does nothing if no request start
     * was tracked for the URL before.
     *
     * @throws Exception
     */
    public function trackRequestEndFor(UriInterface $url): void
    {
        if (!$this->_requestToUrlWasStarted($url)) {
            return;
        }

        $domain = $this->getDomain($url);

        if (!isset($this->latestRequestTimes[$domain])) {
            return;
        }

        $responseTime = $this->time();

        $this->latestResponseTimes[$domain] = $responseTime;

        $this->latestDurations[$domain] = $responseTime->subtract($this->latestRequestTimes[$domain]);

        unset($this->latestRequestTimes[$domain]);

        $this->_internalTrackEndFor($url);
    }

    /**
     * Blocks until the calculated wait time since the latest response from the URL's domain has passed.
     * Returns immediately when no response from that domain was tracked yet.
     *
     * @throws Exception
     */
    public function waitForGo(UriInterface $url): void
    {
        $domain = $this->getDomain($url);

        if (!isset($this->latestDurations[$domain])) {
            return;
        }

        $waitUntil = $this->calcWaitUntil($this->latestDurations[$domain], $this->latestResponseTimes[$domain]);

        $now = $this->time();

        if (!$now->isGreaterThanOrEqual($waitUntil)) {
            usleep($waitUntil->subtract($now)->value);
        }
    }

    protected function time(): Microseconds
    {
        return Microseconds::fromSeconds(microtime(true));
    }

    /**
     * The key that requests are grouped by: the domain of the URL, or its host when there is no domain.
     *
     * @throws Exception
     */
    protected function getDomain(UriInterface $url): string
    {
        $domain = Url::parse($url)->domain();

        if (!$domain) {
            $domain = $url->getHost();
        }

        return is_string($domain) ? $domain : '*';
    }

    /**
     * Picks a random wait time between from and to, applies the min and max limits and adds the
     * result to the time of the latest response.
     */
    protected function calcWaitUntil(
        Microseconds $latestResponseDuration,
        Microseconds $latestResponseTime,
    ): Microseconds {
        $lowerBound = $this->from instanceof MultipleOf ? $this->from->calc($latestResponseDuration) : $this->from;

        $upperBound = $this->to instanceof MultipleOf ? $this->to->calc($latestResponseDuration) : $this->to;

        $waitFor = $this->getRandBetween($lowerBound, $upperBound);

        if ($this->min->isGreaterThan($waitFor)) {
            $waitFor = $this->min;
        }

        if ($this->max && $this->max->isLessThan($waitFor)) {
            $waitFor = $this->max;
        }

        return $latestResponseTime->add($waitFor);
    }

    protected function getRandBetween(Microseconds $from, Microseconds $to): Microseconds
    {
        return $from->equals($to) ? $from : new Microseconds(rand($from->value, $to->value));
    }

    /**
     * @internal
     */
    protected function _internalTrackStartFor(UriInterface $url): void
    {
        $key = (string) $url;

        $this->_currentRequestUrls[$key] = $key;
    }

    /**
     * @internal
     */
    protected function _internalTrackEndFor(UriInterface $url): void
    {
        unset($this->_currentRequestUrls[(string) $url]);
    }

    protected function _requestToUrlWasStarted(UriInterface $url): bool
    {
        return array_key_exists((string) $url, $this->_currentRequestUrls);
    }

    /**
     * @throws InvalidArgumentException
     */
    protected function validateFromAndTo(): void
    {
        if (!$this->fromAndToAreOfSameType()) {
            throw new InvalidArgumentException('From and to values must be of the same type (Seconds or MultipleOf).');
        }

        if ($this->fromIsGreaterThanTo()) {
            throw new InvalidArgumentException('From value can\'t be greater than to value.');
        }
    }

    protected function fromAndToAreOfSameType(): bool
    {
        // Both properties are either Microseconds or MultipleOf, so they are of the same type exactly
        // when both or none of them are Microseconds.
        return ($this->from instanceof Microseconds) === ($this->to instanceof Microseconds);
    }

    protected function fromIsGreaterThanTo(): bool
    {
        if ($this->from instanceof MultipleOf && $this->to instanceof MultipleOf) {
            return $this->from->factorIsGreaterThan($this->to);
        }

        if ($this->from instanceof Microseconds && $this->to instanceof Microseconds) {
            return $this->from->isGreaterThan($this->to);
        }

        return false;
    }
}


================================================
FILE: src/Loader/Http/Politeness/TimingUnits/MultipleOf.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http\Politeness\TimingUnits;

use Crwlr\Utils\Microseconds;

/**
 * A timing value expressed as a factor that is applied to some other duration.
 */
class MultipleOf
{
    public function __construct(public readonly float $factor) {}

    /**
     * Multiply the given duration by the factor (rounded to two decimals).
     *
     * The calculation is done with the factor expressed as an integer number of hundredths.
     *
     * @param Microseconds $microseconds  the duration to scale
     * @return Microseconds  the scaled duration, rounded to whole microseconds
     */
    public function calc(Microseconds $microseconds): Microseconds
    {
        // Round instead of truncating when converting to hundredths: with binary floats e.g. 1.15 * 100
        // evaluates to 114.99999999999999, which a plain (int) cast would cut down to 114.
        $factorTwoDecimalsAsInt = (int) round(round($this->factor, 2) * 100);

        $result = (int) round(($microseconds->value * $factorTwoDecimalsAsInt) / 100);

        return new Microseconds($result);
    }

    /**
     * Tell whether this instance's factor is strictly greater than the other instance's factor.
     */
    public function factorIsGreaterThan(MultipleOf $multipleOf): bool
    {
        return $this->factor > $multipleOf->factor;
    }
}


================================================
FILE: src/Loader/Http/ProxyManager.php
================================================
<?php

namespace Crwlr\Crawler\Loader\Http;

/**
 * Holds a list of proxies and hands them out one after another, starting over after the last one.
 */
class ProxyManager
{
    /**
     * Index of the proxy returned by the previous getProxy() call (null before the first rotation).
     */
    protected ?int $lastUsedProxy = null;

    /**
     * @param string[] $proxies  keys are discarded, the list is re-indexed from 0
     */
    public function __construct(protected array $proxies)
    {
        $this->proxies = array_values($this->proxies);
    }

    /**
     * True when exactly one proxy is configured.
     */
    public function singleProxy(): bool
    {
        return count($this->proxies) === 1;
    }

    /**
     * True when exactly one proxy is configured.
     */
    public function hasOnlySingleProxy(): bool
    {
        return count($this->proxies) === 1;
    }

    /**
     * True when two or more proxies are configured.
     */
    public function hasMultipleProxies(): bool
    {
        return count($this->proxies) > 1;
    }

    /**
     * Get the proxy to use next.
     *
     * With a single proxy, that one is always returned. Otherwise, the proxies are returned in list order and
     * the rotation wraps around to the first one after the last.
     */
    public function getProxy(): string
    {
        if ($this->hasOnlySingleProxy()) {
            return $this->proxies[0];
        }

        $candidateIndex = $this->lastUsedProxy === null ? 0 : $this->lastUsedProxy + 1;

        // Wrap around to the first proxy when the candidate index is past the end of the list.
        $this->lastUsedProxy = isset($this->proxies[$candidateIndex]) ? $candidateIndex : 0;

        return $this->proxies[$this->lastUsedProxy];
    }
}


================================================
FILE: src/Loader/Loader.php
================================================
<?php

namespace Crwlr\Crawler\Loader;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;
use Psr\SimpleCache\CacheInterface;

/**
 * Base class for loaders: holds the user agent, the logger and an optional cache, and manages the hook
 * callbacks (beforeLoad, onCacheHit, onSuccess, onError, afterLoad) that are run via callHook().
 */
abstract class Loader implements LoaderInterface
{
    protected LoggerInterface $logger;

    protected ?CacheInterface $cache = null;

    /**
     * Registered callbacks, grouped by hook name.
     *
     * @var array<string, callable[]>
     */
    protected array $hooks = [
        'beforeLoad' => [],
        'onCacheHit' => [],
        'onSuccess' => [],
        'onError' => [],
        'afterLoad' => [],
    ];

    /**
     * Names of the hooks that were already run since the last _resetCalledHooks() call.
     *
     * @var array<string, bool>
     */
    private array $_hooksCalledInCurrentLoadCall = [];

    /**
     * @param LoggerInterface|null $logger  falls back to a new CliLogger when omitted
     */
    public function __construct(
        protected UserAgentInterface $userAgent,
        ?LoggerInterface $logger = null,
    ) {
        $this->logger = $logger === null ? new CliLogger() : $logger;
    }

    /**
     * Register a callback for the beforeLoad hook.
     */
    public function beforeLoad(callable $callback): void
    {
        $this->addHookCallback('beforeLoad', $callback);
    }

    /**
     * Register a callback for the onCacheHit hook.
     */
    public function onCacheHit(callable $callback): void
    {
        $this->addHookCallback('onCacheHit', $callback);
    }

    /**
     * Register a callback for the onSuccess hook.
     */
    public function onSuccess(callable $callback): void
    {
        $this->addHookCallback('onSuccess', $callback);
    }

    /**
     * Register a callback for the onError hook.
     */
    public function onError(callable $callback): void
    {
        $this->addHookCallback('onError', $callback);
    }

    /**
     * Register a callback for the afterLoad hook.
     */
    public function afterLoad(callable $callback): void
    {
        $this->addHookCallback('afterLoad', $callback);
    }

    public function setCache(CacheInterface $cache): static
    {
        $this->cache = $cache;

        return $this;
    }

    public function userAgent(): UserAgentInterface
    {
        return $this->userAgent;
    }

    /**
     * Can be implemented in a child class to check if it is allowed to load a certain uri (e.g. check robots.txt)
     * Throw a LoadingException when it's not allowed and $throwsException is set to true.
     *
     * This default implementation allows everything.
     */
    protected function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException = false): bool
    {
        return true;
    }

    /**
     * Run all callbacks registered for a hook.
     *
     * Unknown hook names are ignored. The logger is appended as the last argument passed to every callback.
     * A warning is logged when the same hook is run twice within one load call. The afterLoad callbacks are
     * skipped (with a warning) when beforeLoad was not run before.
     */
    protected function callHook(string $hook, mixed ...$arguments): void
    {
        if (!array_key_exists($hook, $this->hooks)) {
            return;
        }

        $alreadyCalled = $this->_hooksCalledInCurrentLoadCall;

        if (array_key_exists($hook, $alreadyCalled)) {
            $this->logger->warning(
                $hook . ' was already called in this load call. Probably a problem in the loader implementation.',
            );
        }

        $isAfterLoadWithCallbacks = $hook === 'afterLoad' && !empty($this->hooks[$hook]);

        if ($isAfterLoadWithCallbacks && !array_key_exists('beforeLoad', $alreadyCalled)) {
            $this->logger->warning(
                'The afterLoad hook was called without a preceding call to the beforeLoad hook. Therefore don\'t ' .
                'run the hook callbacks. Most likely an exception/error occurred  before the beforeLoad hook call.',
            );

            return;
        }

        $arguments[] = $this->logger;

        foreach ($this->hooks[$hook] as $hookCallback) {
            call_user_func($hookCallback, ...$arguments);
        }

        $this->_hooksCalledInCurrentLoadCall[$hook] = true;
    }

    protected function logger(): LoggerInterface
    {
        return $this->logger;
    }

    /**
     * Append a callback to the list of callbacks of the given hook.
     */
    protected function addHookCallback(string $hook, callable $callback): void
    {
        $this->hooks[$hook][] = $callback;
    }

    /**
     * Forget which hooks have been run, so the bookkeeping in callHook() starts from scratch.
     *
     * @internal
     * @return void
     */
    protected function _resetCalledHooks(): void
    {
        $this->_hooksCalledInCurrentLoadCall = [];
    }
}


================================================
FILE: src/Loader/LoaderInterface.php
================================================
<?php

namespace Crwlr\Crawler\Loader;

use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use InvalidArgumentException;
use Psr\SimpleCache\CacheInterface;

/**
 * Contract for loaders: load a subject (with a variant that throws on failure) and accept a PSR-16 cache.
 */
interface LoaderInterface
{
    /**
     * Load the given subject.
     *
     * NOTE(review): presumably the counterpart of loadOrFail() that does not throw when loading fails;
     * confirm against the implementations.
     *
     * @param mixed $subject  The subject to load, whatever the Loader implementation needs to load something.
     * @return mixed
     */
    public function load(mixed $subject): mixed;

    /**
     * Load the given subject and throw when that is not possible.
     *
     * @throws InvalidArgumentException  Throw an InvalidArgumentException when the type of $subject argument isn't
     *                                   valid for the Loader implementation.
     * @throws LoadingException  Throw one when loading failed.
     */
    public function loadOrFail(mixed $subject): mixed;

    /**
     * Add an implementation of the PSR-16 CacheInterface that the Loader will use to cache loaded resources.
     */
    public function setCache(CacheInterface $cache): static;
}


================================================
FILE: src/Logger/CliLogger.php
================================================
<?php

namespace Crwlr\Crawler\Logger;

use DateTime;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use Stringable;
use UnexpectedValueException;

/**
 * Logger that prints every message to the standard output, prefixed with the current time and the
 * log level wrapped in an ANSI color sequence. The $context argument is not used.
 */
class CliLogger implements LoggerInterface
{
    public function emergency(string|Stringable $message, array $context = []): void
    {
        $this->log('emergency', $message, $context);
    }

    public function alert(string|Stringable $message, array $context = []): void
    {
        $this->log('alert', $message, $context);
    }

    public function critical(string|Stringable $message, array $context = []): void
    {
        $this->log('critical', $message, $context);
    }

    public function error(string|Stringable $message, array $context = []): void
    {
        $this->log('error', $message, $context);
    }

    public function warning(string|Stringable $message, array $context = []): void
    {
        $this->log('warning', $message, $context);
    }

    public function notice(string|Stringable $message, array $context = []): void
    {
        $this->log('notice', $message, $context);
    }

    public function info(string|Stringable $message, array $context = []): void
    {
        $this->log('info', $message, $context);
    }

    public function debug(string|Stringable $message, array $context = []): void
    {
        $this->log('debug', $message, $context);
    }

    /**
     * Print one log line: time, colored level and the message followed by a line break.
     *
     * @param mixed $level
     * @param mixed[] $context
     * @throws InvalidArgumentException  when $level is not a string
     * @throws UnexpectedValueException  when $level is not one of the eight known level names
     */
    public function log($level, string|Stringable $message, array $context = []): void
    {
        if (!is_string($level)) {
            throw new InvalidArgumentException('Level must be string.');
        }

        $knownLevels = ['emergency', 'alert', 'critical', 'error', 'warning', 'notice', 'info', 'debug'];

        if (!in_array($level, $knownLevels, true)) {
            throw new UnexpectedValueException('Unknown log level.');
        }

        $this->printTimeAndLevel($level);

        echo "{$message}\n";
    }

    /**
     * Print the line prefix: the time, then the upper-cased level in brackets, colored via ANSI escape codes.
     */
    protected function printTimeAndLevel(string $level): void
    {
        echo sprintf(
            "%s \033[0;%sm[%s]\033[0m ",
            $this->time(),
            $this->levelColor($level),
            strtoupper($level),
        );
    }

    /**
     * Current time formatted as hours:minutes:seconds:microseconds.
     */
    protected function time(): string
    {
        $now = new DateTime();

        return $now->format('H:i:s:u');
    }

    /**
     * Map a log level to the ANSI color code used for it.
     */
    protected function levelColor(string $level): string
    {
        $colorCodeByLevel = [
            'emergency' => '91', // bright red
            'alert' => '91',
            'critical' => '91',
            'error' => '31',     // red
            'warning' => '36',   // cyan
            'notice' => '34',    // blue
            'info' => '32',      // green
            'debug' => '33',     // yellow
        ];

        return $colorCodeByLevel[$level];
    }
}


================================================
FILE: src/Logger/PreStepInvocationLogger.php
================================================
<?php

namespace Crwlr\Crawler\Logger;

use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use Stringable;
use UnexpectedValueException;

/**
 * Logger that only collects messages in memory, so they can be replayed to another logger later
 * via passToOtherLogger().
 */
class PreStepInvocationLogger implements LoggerInterface
{
    /**
     * Collected log entries, each with the keys 'level' and 'message', in the order they were logged.
     *
     * @var array<int, array<string, string>>
     */
    public array $messages = [];

    public function emergency(string|Stringable $message, array $context = []): void
    {
        $this->log('emergency', $message, $context);
    }

    public function alert(string|Stringable $message, array $context = []): void
    {
        $this->log('alert', $message, $context);
    }

    public function critical(string|Stringable $message, array $context = []): void
    {
        $this->log('critical', $message, $context);
    }

    public function error(string|Stringable $message, array $context = []): void
    {
        $this->log('error', $message, $context);
    }

    public function warning(string|Stringable $message, array $context = []): void
    {
        $this->log('warning', $message, $context);
    }

    public function notice(string|Stringable $message, array $context = []): void
    {
        $this->log('notice', $message, $context);
    }

    public function info(string|Stringable $message, array $context = []): void
    {
        $this->log('info', $message, $context);
    }

    public function debug(string|Stringable $message, array $context = []): void
    {
        $this->log('debug', $message, $context);
    }

    /**
     * Store the message together with its level. The $context argument is not stored.
     *
     * @param mixed $level
     * @param mixed[] $context
     * @throws InvalidArgumentException  when $level is not a string
     * @throws UnexpectedValueException  when $level is not one of the eight known level names
     */
    public function log($level, string|Stringable $message, array $context = []): void
    {
        if (!is_string($level)) {
            throw new InvalidArgumentException('Level must be string.');
        }

        $knownLevels = ['emergency', 'alert', 'critical', 'error', 'warning', 'notice', 'info', 'debug'];

        if (!in_array($level, $knownLevels, true)) {
            throw new UnexpectedValueException('Unknown log level.');
        }

        $this->messages[] = ['level' => $level, 'message' => $message];
    }

    /**
     * Replay all collected messages on the given logger, calling the method named like each message's level.
     */
    public function passToOtherLogger(LoggerInterface $logger): void
    {
        foreach ($this->messages as $entry) {
            ['level' => $levelMethod, 'message' => $text] = $entry;

            $logger->{$levelMethod}($text);
        }
    }
}


================================================
FILE: src/Output.php
================================================
<?php

namespace Crwlr\Crawler;

/**
 * Distinct type for step output values. Declares no members of its own; everything is inherited from Io.
 */
class Output extends Io {}


================================================
FILE: src/Result.php
================================================
<?php

namespace Crwlr\Crawler;

use Crwlr\Crawler\Utils\OutputTypeHelper;

/**
 * Collects the data of one result as key => value pairs.
 */
final class Result
{
    /**
     * @var mixed[]
     */
    private array $data = [];

    /**
     * @param Result|null $result  optional result whose data is copied into the new instance
     */
    public function __construct(protected ?Result $result = null)
    {
        if ($result) {
            $this->data = $result->data;
        }
    }

    /**
     * Add a value under the given key.
     *
     * An empty key is replaced with a generated "unnamedN" key. When the key already exists, the values are
     * collected in a list: an existing scalar or associative array value is wrapped together with the new
     * value, an existing list gets the new value appended.
     */
    public function set(string $key, mixed $value): self
    {
        if ($key === '') {
            $key = $this->getUnnamedKey();
        }

        if (array_key_exists($key, $this->data)) {
            if (!is_array($this->data[$key]) || $this->isAssociativeArray($this->data[$key])) {
                $this->data[$key] = [$this->data[$key], $value];
            } else {
                $this->data[$key][] = $value;
            }
        } else {
            $this->data[$key] = $value;
        }

        return $this;
    }

    public function has(string $key): bool
    {
        return array_key_exists($key, $this->data);
    }

    /**
     * Get the value stored under the key, or $default when the key does not exist.
     */
    public function get(string $key, mixed $default = null): mixed
    {
        if ($this->has($key)) {
            return $this->data[$key];
        }

        return $default;
    }

    /**
     * Get the result data as array.
     *
     * When the only entry is an associative array stored under a generated "unnamedN" key, that inner
     * array is returned directly instead of being wrapped in the generated key.
     *
     * @return mixed[]
     */
    public function toArray(): array
    {
        $data = OutputTypeHelper::recursiveChildObjectsToArray($this->data);

        if (count($data) === 1) {
            $onlyKey = array_key_first($data);

            // str_contains() takes the haystack first: look for "unnamed" inside the key, not the other way
            // round. The cast covers keys that PHP turned into integers.
            if (
                str_contains((string) $onlyKey, 'unnamed') &&
                OutputTypeHelper::isAssociativeArray($data[$onlyKey])
            ) {
                return $data[$onlyKey];
            }
        }

        return $data;
    }

    /**
     * Find the first "unnamedN" key (N counting up from 1) that currently yields no non-null value.
     */
    private function getUnnamedKey(): string
    {
        $i = 1;

        while ($this->get('unnamed' . $i) !== null) {
            $i++;
        }

        return 'unnamed' . $i;
    }

    /**
     * Decide by the first key only: a string first key means associative. Empty arrays are not associative.
     *
     * @param mixed[] $array
     */
    private function isAssociativeArray(array $array): bool
    {
        foreach ($array as $key => $value) {
            return is_string($key);
        }

        return false;
    }
}


================================================
FILE: src/Steps/BaseStep.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

use Adbar\Dot;
use Closure;
use Crwlr\Crawler\Crawler;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Io;
use Crwlr\Crawler\Logger\PreStepInvocationLogger;
use Crwlr\Crawler\Output;
use Crwlr\Crawler\Result;
use Crwlr\Crawler\Steps\Exceptions\PreRunValidationException;
use Crwlr\Crawler\Steps\Filters\Filterable;
use Crwlr\Crawler\Steps\Refiners\RefinerInterface;
use Crwlr\Crawler\Utils\OutputTypeHelper;
use Generator;
use InvalidArgumentException;
use Psr\Log\LoggerInterface;

/**
 * Base class for classes Step and Group which share some things in terms of adding output data to Result objects.
 */

abstract class BaseStep implements StepInterface
{
    use Filterable;

    /**
     * true means: keep the whole output array/object
     * string: keep that one key from the (array/object) output
     * array: keep those keys from the (array/object) output
     *
     * @var bool|string|string[]
     */
    protected bool|string|array $keep = false;

    /**
     * Same as $keep, but for input data.
     *
     * @var bool|string|string[]
     */
    protected bool|string|array $keepFromInput = false;

    protected ?string $keepAs = null;

    protected ?string $keepInputAs = null;

    protected ?Crawler $parentCrawler = null;

    /**
     * @var array<string, Closure>
     */
    protected array $subCrawlers = [];

    protected ?LoggerInterface $logger = null;

    protected ?string $useInputKey = null;

    protected bool|string $uniqueInput = false;

    /**
     * @var array<int|string, true>
     */
    protected array $uniqueInputKeys = [];

    protected bool|string $uniqueOutput = false;

    /**
     * @var array<int|string, true>
     */
    protected array $uniqueOutputKeys = [];

    /**
     * @var array<Closure|RefinerInterface|array{ key: string, refiner: Closure|RefinerInterface}>
     */
    protected array $refiners = [];

    protected ?string $outputKey = null;

    protected ?int $maxOutputs = null;

    protected int $currentOutputCount = 0;

    private ?Input $fullOriginalInput = null;

    /**
     * Run the step for a single input.
     *
     * Implementations yield the Output objects they produce for the given input.
     *
     * @param Input $input
     * @return Generator<Output>
     */
    abstract public function invokeStep(Input $input): Generator;

    /**
     * Set the logger of the step and hand it on to all registered refiners implementing RefinerInterface.
     *
     * Messages collected so far in a PreStepInvocationLogger are passed on to the new logger first.
     */
    public function addLogger(LoggerInterface $logger): static
    {
        $previousLogger = $this->logger;

        if ($previousLogger instanceof PreStepInvocationLogger) {
            $previousLogger->passToOtherLogger($logger);
        }

        $this->logger = $logger;

        foreach ($this->refiners as $registeredRefiner) {
            // Refiners registered with a key are stored as ['key' => ..., 'refiner' => ...].
            $refiner = is_array($registeredRefiner) ? $registeredRefiner['refiner'] : $registeredRefiner;

            if ($refiner instanceof RefinerInterface) {
                $refiner->addLogger($logger);
            }
        }

        return $this;
    }

    /**
     * Store a reference to the crawler this step belongs to; runSubCrawlersFor() uses it to get sub crawlers.
     */
    public function setParentCrawler(Crawler $crawler): static
    {
        $this->parentCrawler = $crawler;

        return $this;
    }

    /**
     * Define what to keep from the step's output: nothing passed (null) means the whole output,
     * otherwise the given key or keys.
     *
     * @param string|string[]|null $keys
     */
    public function keep(string|array|null $keys = null): static
    {
        $this->keep = $keys ?? true;

        return $this;
    }

    /**
     * Define the key to use for keeping the step's output value.
     */
    public function keepAs(string $key): static
    {
        $this->keepAs = $key;

        return $this;
    }

    /**
     * Define what to keep from the step's input: nothing passed (null) means the whole input,
     * otherwise the given key or keys.
     *
     * @param string|string[]|null $keys
     */
    public function keepFromInput(string|array|null $keys = null): static
    {
        $this->keepFromInput = $keys ?? true;

        return $this;
    }

    /**
     * Define the key to use for keeping the step's input value.
     */
    public function keepInputAs(string $key): static
    {
        $this->keepInputAs = $key;

        return $this;
    }

    /**
     * True when the step is configured to keep data from its output or from its input.
     */
    public function keepsAnything(): bool
    {
        if ($this->keepsAnythingFromOutputData()) {
            return true;
        }

        return $this->keepsAnythingFromInputData();
    }

    /**
     * True when keepFromInput() or keepInputAs() was configured.
     */
    public function keepsAnythingFromInputData(): bool
    {
        $nothingConfigured = $this->keepFromInput === false && $this->keepInputAs === null;

        return !$nothingConfigured;
    }

    /**
     * True when keep() or keepAs() was configured.
     */
    public function keepsAnythingFromOutputData(): bool
    {
        $nothingConfigured = $this->keep === false && $this->keepAs === null;

        return !$nothingConfigured;
    }

    /**
     * Define the key of the input data whose value the step shall use as its input (see getInputKeyToUse()).
     */
    public function useInputKey(string $key): static
    {
        $this->useInputKey = $key;

        return $this;
    }

    /**
     * Enable uniqueness checking for inputs, optionally based on a certain key.
     */
    public function uniqueInputs(?string $key = null): static
    {
        $this->uniqueInput = $key === null ? true : $key;

        return $this;
    }

    /**
     * Enable uniqueness checking for outputs, optionally based on a certain key.
     */
    public function uniqueOutputs(?string $key = null): static
    {
        $this->uniqueOutput = $key === null ? true : $key;

        return $this;
    }

    /**
     * Register a refiner, either for the whole output or for one key of it.
     *
     * Call with only a refiner to refine the whole output, or with a key and a refiner.
     * If the step already has a logger, it is handed to a RefinerInterface instance.
     *
     * @throws InvalidArgumentException  when a key is given without a refiner
     */
    public function refineOutput(
        string|Closure|RefinerInterface $keyOrRefiner,
        Closure|RefinerInterface|null $refiner = null,
    ): static {
        $logger = $this->logger;

        if ($logger) {
            if ($refiner instanceof RefinerInterface) {
                $refiner->addLogger($logger);
            } elseif ($keyOrRefiner instanceof RefinerInterface) {
                $keyOrRefiner->addLogger($logger);
            }
        }

        if (!is_string($keyOrRefiner)) {
            $this->refiners[] = $keyOrRefiner;

            return $this;
        }

        if ($refiner === null) {
            throw new InvalidArgumentException(
                'You have to provide a Refiner (Closure or instance of RefinerInterface)',
            );
        }

        $this->refiners[] = ['key' => $keyOrRefiner, 'refiner' => $refiner];

        return $this;
    }

    /**
     * Store the key for the step's output. When set, validateBeforeRun() skips its keep() checks.
     */
    public function outputKey(string $key): static
    {
        $this->outputKey = $key;

        return $this;
    }

    /**
     * Store the maximum number of outputs for the step.
     */
    public function maxOutputs(int $maxOutputs): static
    {
        $this->maxOutputs = $maxOutputs;

        return $this;
    }

    /**
     * Clear the per-run state: the remembered unique input/output keys and the output counter.
     */
    public function resetAfterRun(): void
    {
        $this->uniqueInputKeys = [];

        $this->uniqueOutputKeys = [];

        $this->currentOutputCount = 0;
    }

    /**
     * Tell what type of outputs the step will yield.
     *
     * Implementing this in a step helps to spot potential errors upfront, when a crawler run is started:
     * - return StepOutputType::AssociativeArrayOrObject if the step only yields associative array
     *   (or object) outputs,
     * - return StepOutputType::Scalar if it only yields scalar (string, int, float, bool) outputs.
     *
     * If the step can potentially yield both types, but the state of the class determines which one,
     * please implement this accordingly. Return StepOutputType::Mixed only if the type can't be defined
     * upfront, because it depends on the input. This default implementation returns StepOutputType::Mixed.
     *
     * @return StepOutputType
     */
    public function outputType(): StepOutputType
    {
        return StepOutputType::Mixed;
    }

    /**
     * Check the step's keep configuration before a crawler run starts.
     *
     * For the first step (an array of initial inputs is passed instead of a previous step), the initial
     * inputs are validated as well. Keeping output data with keep() from a step that yields scalar outputs
     * is an error, and only a warning when the output type is mixed. The same applies to keeping input data
     * with keepFromInput() when the previous step yields scalar (or mixed) outputs.
     *
     * @param BaseStep|mixed[] $previousStepOrInitialInputs
     * @throws PreRunValidationException
     */
    public function validateBeforeRun(BaseStep|array $previousStepOrInitialInputs): void
    {
        if (!$previousStepOrInitialInputs instanceof BaseStep) {
            $this->validateFirstStepBeforeRun($previousStepOrInitialInputs);
        }

        // A key defined via keepAs() or outputKey() gives scalar outputs a name, so nothing to check then.
        if ($this->keep !== false && $this->keepAs === null && $this->outputKey === null) {
            $outputType = $this->outputType();

            if ($outputType === StepOutputType::Scalar) {
                throw new PreRunValidationException(
                    'Keeping data from a step that yields scalar value outputs (= single string/int/bool/float with ' .
                    'no key like in an associative array or object) requires to define a key, by using keepAs() ' .
                    'instead of keep()',
                );
            } elseif ($outputType === StepOutputType::Mixed) {
                $this->logger?->warning(
                    $this->getPreValidationRunMessageStartWithStepClassName() . ' potentially yields scalar value ' .
                    'outputs (= single string/int/bool/float with no key like in an associative array or object). ' .
                    'If it does (yield a scalar value output), it can not keep that output value, because it needs ' .
                    'a key for that. To avoid this, define a key for scalar outputs by using the keepAs() method.',
                );
            }
        }

        if (
            $this->keepFromInput !== false &&
            $previousStepOrInitialInputs instanceof BaseStep &&
            $this->keepInputAs === null
        ) {
            $previousStepOutputType = $previousStepOrInitialInputs->outputType();

            if ($previousStepOutputType === StepOutputType::Scalar) {
                // The method that defines a key for kept input data is keepInputAs(); keepAs() is for output.
                throw new PreRunValidationException(
                    'You are trying to keep data from a step\'s input with keepFromInput(), but the step before it ' .
                    'returns scalar value outputs (= single string/int/bool/float with no key like in an associative ' .
                    'array or object). Please define a key for the input data to keep, by using keepInputAs() ' .
                    'instead.',
                );
            } elseif ($previousStepOutputType === StepOutputType::Mixed) {
                $this->logger?->warning(
                    $this->getPreValidationRunMessageStartWithStepClassName($previousStepOrInitialInputs) .
                    ' potentially yields scalar value outputs (= single string/int/bool/float with no key like in ' .
                    'an associative array or object). If it does (yield a scalar value output) the next step can not ' .
                    'keep it by using keepFromInput(). To avoid this, define a key for scalar inputs by using the ' .
                    'keepInputAs() method.',
                );
            }
        }
    }

    /**
     * Register a sub crawler for an output key.
     *
     * The builder closure is called with a sub crawler from the parent crawler (see runSubCrawlersFor()).
     * Registering another builder for the same key replaces the previous one.
     */
    public function subCrawlerFor(string $for, Closure $crawlerBuilder): static
    {
        $this->subCrawlers[$for] = $crawlerBuilder;

        return $this;
    }

    /**
     * Remember the full input object, so it remains accessible (via getFullOriginalInput()) in case
     * useInputKey() was used.
     */
    protected function storeOriginalInput(Input $input): void
    {
        $this->fullOriginalInput = $input;
    }

    /**
     * Get the full input object last stored with storeOriginalInput(), or null when none was stored.
     *
     * This provides access to the whole input, even when useInputKey() was used.
     */
    protected function getFullOriginalInput(): ?Input
    {
        return $this->fullOriginalInput;
    }

    /**
     * Run the registered sub crawlers with the values of the output properties they were registered for,
     * and replace those property values with the sub crawler results.
     *
     * A property is replaced with null when the sub crawler yields no result, with the result array when it
     * yields exactly one, and with a list of result arrays otherwise. Properties with a null value are skipped.
     * Outputs that are not arrays with string keys are returned unchanged (an error is logged).
     */
    protected function runSubCrawlersFor(Output $output): Output
    {
        if (empty($this->subCrawlers)) {
            return $output;
        }

        if (!$output->isArrayWithStringKeys()) {
            $this->logger?->error(
                'The sub crawler feature works only with outputs that are associative arrays (arrays with ' .
                'string keys). The feature was called with an output of type ' . gettype($output->get()) . '.',
            );

            return $output;
        }

        if (!$this->parentCrawler) {
            $this->logger?->error('Can\'t make sub crawler, because the step has no reference to the parent crawler.');

            return $output;
        }

        foreach ($this->subCrawlers as $forKey => $crawlerBuilder) {
            $subCrawlerInput = $output->getProperty($forKey);

            if ($subCrawlerInput === null) {
                continue;
            }

            $subCrawler = $crawlerBuilder($this->parentCrawler->getSubCrawler());

            if (is_array($subCrawlerInput)) {
                $subCrawler->inputs($subCrawlerInput);
            } else {
                $subCrawler->input($subCrawlerInput);
            }

            $results = [];

            foreach ($subCrawler->run() as $subCrawlerResult) {
                $results[] = $subCrawlerResult;
            }

            $newPropertyValue = match (count($results)) {
                0 => null,
                1 => $results[0]->toArray(),
                default => array_map(function (Result $result) {
                    return $result->toArray();
                }, $results),
            };

            $output = $output->withPropertyValue($forKey, $newPropertyValue);
        }

        return $output;
    }

    /**
     * Aliases for output keys, that can be used with keep().
     *
     * Child classes can override this method and return their mappings. By default there are none.
     *
     * @return array<string, string>  alias => output key
     */
    protected function outputKeyAliases(): array
    {
        return [];
    }

    /**
     * Validate the initial inputs against the configuration of the (first) step.
     *
     * Logs an error when there are no initial inputs at all. Throws when keepFromInput() was used, but
     * one of the inputs is not an associative array or object.
     *
     * @param mixed[] $initialInputs
     * @throws PreRunValidationException
     */
    protected function validateFirstStepBeforeRun(array $initialInputs): void
    {
        if ($initialInputs === []) {
            $this->logger?->error('You did not provide any initial inputs for your crawler.');

            return;
        }

        if ($this->keepFromInput === false) {
            return;
        }

        foreach ($initialInputs as $initialInput) {
            if (OutputTypeHelper::isAssociativeArrayOrObject($initialInput)) {
                continue;
            }

            throw new PreRunValidationException(
                'The initial inputs contain scalar values (without keys) and you are calling keepFromInput() ' .
                'on the first step (if not the first step in your whole crawler, check sub crawlers). Please ' .
                'use keepInputAs() instead with a key, that the input value should have in the kept data.',
            );
        }
    }

    /**
     * Build the beginning of a pre-run validation message, naming the step it is about.
     *
     * Named classes are referred to by their class name. Anonymous classes are described via their parent
     * class, unless that parent is just the generic Step or BaseStep class.
     *
     * @param BaseStep|null $step  the step to name, defaults to this step
     */
    protected function getPreValidationRunMessageStartWithStepClassName(?BaseStep $step = null): string
    {
        $className = $this->getStepClassName($step);

        if ($className) {
            return 'The ' . $className . ' step';
        }

        $parentClassName = $this->getParentStepClassName($step);

        $genericBaseClasses = ['Crwlr\\Crawler\\Steps\\Step', 'Crwlr\\Crawler\\Steps\\BaseStep'];

        if ($parentClassName && !in_array($parentClassName, $genericBaseClasses, true)) {
            return 'An anonymous class step, that is extending the ' . $parentClassName . ' step';
        }

        return 'An anonymous class step';
    }

    /**
     * Get the class name of the given step (or this step), or null when it is an anonymous class.
     */
    protected function getStepClassName(?BaseStep $step = null): ?string
    {
        $className = get_class($step ?? $this);

        return str_contains($className, '@anonymous') ? null : $className;
    }

    /**
     * Get the name of the direct parent class of the given step (or this step).
     *
     * Returns null when there is no parent class, or the parent is an anonymous class itself.
     */
    protected function getParentStepClassName(?BaseStep $step = null): ?string
    {
        $parentClasses = class_parents($step ?? $this);

        $directParent = reset($parentClasses);

        if (!$directParent || str_contains($directParent, '@anonymous')) {
            return null;
        }

        return $directParent;
    }

    /**
     * Apply the useInputKey() setting to an input.
     *
     * Without the setting, the input is returned as is. Otherwise, the value is taken from the input array
     * and, if the key is missing there, from the data kept from previous steps. If the key is found in
     * neither of them, a warning is logged and null is returned.
     */
    protected function getInputKeyToUse(Input $input): ?Input
    {
        $key = $this->useInputKey;

        if ($key === null) {
            return $input;
        }

        $inputValue = $input->get();

        $inputIsArray = is_array($inputValue);

        if ($inputIsArray && array_key_exists($key, $inputValue)) {
            return $input->withValue($inputValue[$key]);
        }

        if (array_key_exists($key, $input->keep)) {
            return $input->withValue($input->keep[$key]);
        }

        if ($inputIsArray) {
            $warningMessage = 'Can\'t get key from input, because it does not exist.';
        } else {
            $warningMessage = 'Can\'t get key from input, because input is of type ' .
                gettype($inputValue) . ' instead of array.';
        }

        if (!empty($input->keep)) {
            $warningMessage .= ' Key also is not present in data kept from previous steps.';
        }

        $this->logger?->warning($warningMessage);

        return null;
    }

    /**
     * Check whether an input or output was not seen before, and remember it when so.
     *
     * The identifying key comes from the object's setKey() method, which gets the configured uniqueness
     * key, when uniqueInputs()/uniqueOutputs() was called with one.
     */
    protected function inputOrOutputIsUnique(Io $io): bool
    {
        $isInput = $io instanceof Input;

        $uniquenessSetting = $isInput ? $this->uniqueInput : $this->uniqueOutput;

        $alreadySeenKeys = $isInput ? $this->uniqueInputKeys : $this->uniqueOutputKeys;

        if (is_string($uniquenessSetting)) {
            $key = $io->setKey($uniquenessSetting);
        } else {
            $key = $io->setKey();
        }

        if (isset($alreadySeenKeys[$key])) {
            return false;
        }

        // Don't keep value, just the key, to keep memory usage low.
        if ($isInput) {
            $this->uniqueInputKeys[$key] = true;
        } else {
            $this->uniqueOutputKeys[$key] = true;
        }

        return true;
    }

    /**
     * Run all registered refiners on an output value, in the order they were registered.
     *
     * A refiner registered for a key refines only that element of the output, if it is set. In all other
     * cases the refiner gets the whole output value and its return value replaces the whole output value.
     * Closures are called bound to this step and also receive the input value.
     */
    protected function applyRefiners(mixed $outputValue, mixed $inputValue): mixed
    {
        foreach ($this->refiners as $registeredRefiner) {
            $hasKey = is_array($registeredRefiner);

            $refinesSingleKey = $hasKey && isset($outputValue[$registeredRefiner['key']]);

            $valueToRefine = $refinesSingleKey ? $outputValue[$registeredRefiner['key']] : $outputValue;

            $refiner = $hasKey ? $registeredRefiner['refiner'] : $registeredRefiner;

            if ($refiner instanceof Closure) {
                $refinedValue = $refiner->call($this, $valueToRefine, $inputValue);
            } else {
                $refinedValue = $refiner->refine($valueToRefine);
            }

            if ($refinesSingleKey) {
                $outputValue[$registeredRefiner['key']] = $refinedValue;
            } else {
                $outputValue = $refinedValue;
            }
        }

        return $outputValue;
    }

    /**
     * Creates the Output object for the given output data.
     *
     * The new output starts with the data kept in the input, is then passed through
     * runSubCrawlersFor() and finally keepData() adds the data this step is configured to keep.
     */
    protected function makeOutput(mixed $outputData, Input $input): Output
    {
        $result = $this->runSubCrawlersFor(new Output($outputData, $input->keep));

        $this->keepData($result, $input);

        return $result;
    }

    /**
     * Adds the data that this step is configured to keep, to the given output object.
     *
     * Data from the input is added first, so it is already part of $output->keep when the
     * data to keep from the output is determined.
     */
    protected function keepData(Output $output, Input $input): void
    {
        if (!$this->keepsAnything()) {
            return;
        }

        $fromInput = $this->keepsAnythingFromInputData()
            ? $this->getInputDataToKeep($input, $output->keep)
            : null;

        if (!empty($fromInput)) {
            $output->keep($fromInput);
        }

        $fromOutput = $this->keepsAnythingFromOutputData()
            ? $this->getOutputDataToKeep($output, $output->keep)
            : null;

        if (!empty($fromOutput)) {
            $output->keep($fromOutput);
        }
    }

    /**
     * Determines the data to keep from an output object.
     *
     * Delegates to getInputOrOutputDataToKeep(), which distinguishes inputs and outputs itself.
     *
     * @param array<string, mixed> $alreadyKept
     * @return mixed[]|null
     */
    protected function getOutputDataToKeep(Output $output, array $alreadyKept): ?array
    {
        $dataToKeep = $this->getInputOrOutputDataToKeep($output, $alreadyKept);

        return $dataToKeep;
    }

    /**
     * Determines the data to keep from an input object.
     *
     * Delegates to getInputOrOutputDataToKeep(), which distinguishes inputs and outputs itself.
     *
     * @param array<string, mixed> $alreadyKept
     * @return mixed[]|null
     */
    protected function getInputDataToKeep(Input $input, array $alreadyKept): ?array
    {
        $dataToKeep = $this->getInputOrOutputDataToKeep($input, $alreadyKept);

        return $dataToKeep;
    }

    /**
     * Builds the array of data to keep from an input or output object.
     *
     * For outputs the keep and keepAs settings are used, for inputs keepFromInput and keepInputAs.
     * Returns null when nothing shall be kept from the given object.
     *
     * @param array<string, mixed> $alreadyKept
     * @return mixed[]|null
     */
    protected function getInputOrOutputDataToKeep(Io $io, array $alreadyKept): ?array
    {
        $isOutput = $io instanceof Output;

        $keepSetting = $isOutput ? $this->keep : $this->keepFromInput;

        $keepAsKey = $isOutput ? $this->keepAs : $this->keepInputAs;

        $data = $io->get();

        $dataIsScalar = OutputTypeHelper::isScalar($data);

        // An explicit key wins for scalar values, and when there is no other keep setting at all.
        if ($keepAsKey !== null && ($dataIsScalar || $keepSetting === false)) {
            return [$keepAsKey => $data];
        }

        if ($keepSetting === false) {
            return null;
        }

        if ($dataIsScalar) {
            // A scalar value has no key of its own, so it can only be kept with a generated key.
            $variableMessagePart = $isOutput ? 'yielded an output' : 'received an input';

            $this->logger?->error(
                'A ' . get_class($this) . ' step ' . $variableMessagePart . ' that is neither an associative ' .
                'array, nor an object, so there is no key for the value to keep. Please define a key for the ' .
                'output by using keepAs() instead of keep(). The value is now kept with an \'unnamed\' key.',
            );

            return [$this->nextUnnamedKey($alreadyKept) => $data];
        }

        if (!is_array($data)) {
            $data = OutputTypeHelper::objectToArray($data);
        }

        if ($keepSetting === true) {
            return $data;
        }

        if (is_string($keepSetting)) {
            return [$keepSetting => $this->getOutputPropertyFromArray($keepSetting, $data)];
        }

        return $this->mapKeepProperties($data, $keepSetting);
    }

    /**
     * Finds the first key of the form 'unnamed1', 'unnamed2', ... that is not used in $data yet.
     *
     * Uses array_key_exists() instead of isset(), so that a key that already holds a null value
     * is considered taken and its entry is not overwritten.
     *
     * @param array<string, mixed> $data
     * @return string
     */
    protected function nextUnnamedKey(array $data): string
    {
        $i = 1;

        while (array_key_exists('unnamed' . $i, $data)) {
            $i++;
        }

        return 'unnamed' . $i;
    }

    /**
     * Picks the properties listed in $keep from $data.
     *
     * Elements of $keep with an integer key keep the property under its own name.
     * Elements with a string key keep the property named by the value, under that string key.
     *
     * @param mixed[] $data
     * @param array<int|string, string> $keep
     * @return mixed[]
     */
    protected function mapKeepProperties(array $data, array $keep): array
    {
        $kept = [];

        foreach ($keep as $targetKey => $property) {
            // Array keys are always either int or string, so these two cases are exhaustive.
            $keepAs = is_int($targetKey) ? $property : $targetKey;

            $kept[$keepAs] = $this->getOutputPropertyFromArray($property, $data);
        }

        return $kept;
    }

    /**
     * Gets the value for a key from a data array.
     *
     * Lookup order:
     *  1. the key itself as top level array key,
     *  2. when the key is an output key alias: the real key behind that alias,
     *  3. the key as dot notation path, after child objects have been converted to arrays.
     *
     * This method is also used for input data, which does not necessarily contain the real key
     * behind an alias. Therefore, the real key is only used when it actually exists in $data,
     * instead of triggering an "Undefined array key" warning.
     *
     * @param mixed[] $data
     */
    protected function getOutputPropertyFromArray(string $key, array $data): mixed
    {
        if (array_key_exists($key, $data)) {
            return $data[$key];
        }

        if ($this->isOutputKeyAlias($key)) {
            $realKey = $this->getOutputKeyAliasRealKey($key);

            if (array_key_exists($realKey, $data)) {
                return $data[$realKey];
            }
        }

        $data = OutputTypeHelper::recursiveChildObjectsToArray($data);

        $dot = new Dot($data);

        return $dot->get($key);
    }

    /**
     * Tells whether the given key is one of the keys of the array returned by outputKeyAliases().
     */
    protected function isOutputKeyAlias(string $key): bool
    {
        $aliases = $this->outputKeyAliases();

        return array_key_exists($key, $aliases);
    }

    /**
     * Returns the value that outputKeyAliases() maps the given alias key to.
     */
    protected function getOutputKeyAliasRealKey(string $key): string
    {
        return $this->outputKeyAliases()[$key];
    }

    /**
     * Tells whether the configured maximum number of outputs has been reached.
     *
     * Without a maximum (maxOutputs is null), the limit can never be exceeded.
     */
    protected function maxOutputsExceeded(): bool
    {
        if ($this->maxOutputs === null) {
            return false;
        }

        return $this->currentOutputCount >= $this->maxOutputs;
    }

    /**
     * Counts a yielded output. Counting only happens when a maximum number of outputs is set.
     */
    protected function trackYieldedOutput(): void
    {
        if ($this->maxOutputs === null) {
            return;
        }

        $this->currentOutputCount += 1;
    }
}


================================================
FILE: src/Steps/Csv.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

use Exception;
use Generator;
use InvalidArgumentException;

/**
 * Step parsing CSV data and yielding one array per row, keyed by the mapped column names.
 *
 * In 'string' mode (default, see parseString()) the input itself is parsed as CSV.
 * In 'file' mode (see parseFile()) the input is used as path to a CSV file.
 */
class Csv extends Step
{
    /**
     * Parsing mode, either 'string' or 'file'.
     */
    protected string $method = 'string';

    protected string $separator = ',';

    protected string $enclosure = '"';

    protected string $escape = '\\';

    /**
     * @param array<string|null> $columnMapping  Output keys by column position. Columns without a
     *                                           (non-empty) entry are left out of the output rows.
     * @param bool $skipFirstLine                Whether the first line shall not be yielded as a row.
     */
    public function __construct(protected array $columnMapping = [], protected bool $skipFirstLine = false) {}

    /**
     * Creates a step that parses its input as a CSV string.
     *
     * @param array<string|null> $columnMapping
     */
    public static function parseString(array $columnMapping = [], bool $skipFirstLine = false): self
    {
        return new self($columnMapping, $skipFirstLine);
    }

    /**
     * Creates a step that uses its input as the path of a CSV file to read.
     *
     * @param array<string|null> $columnMapping
     */
    public static function parseFile(array $columnMapping = [], bool $skipFirstLine = false): self
    {
        $instance = new self($columnMapping, $skipFirstLine);

        $instance->method = 'file';

        return $instance;
    }

    public function skipFirstLine(): static
    {
        $this->skipFirstLine = true;

        return $this;
    }

    /**
     * Sets the column separator.
     *
     * The underlying fgetcsv() and str_getcsv() functions require exactly one single-byte
     * character, so the empty string is rejected here as well, instead of failing later when parsing.
     *
     * @throws InvalidArgumentException when the separator is not exactly one byte long.
     */
    public function separator(string $separator): static
    {
        if (strlen($separator) !== 1) {
            throw new InvalidArgumentException('CSV separator must be single character');
        }

        $this->separator = $separator;

        return $this;
    }

    public function enclosure(string $enclosure): static
    {
        $this->enclosure = $enclosure;

        return $this;
    }

    public function escape(string $escape): static
    {
        $this->escape = $escape;

        return $this;
    }

    public function outputType(): StepOutputType
    {
        return StepOutputType::AssociativeArrayOrObject;
    }

    /**
     * In 'string' mode the input must be a string or HTTP response, in 'file' mode a string or stringable.
     *
     * @throws InvalidArgumentException when the step is in an unknown parsing mode.
     */
    protected function validateAndSanitizeInput(mixed $input): string
    {
        if ($this->method === 'string') {
            return $this->validateAndSanitizeStringOrHttpResponse($input);
        } elseif ($this->method === 'file') {
            return $this->validateAndSanitizeStringOrStringable($input);
        } else {
            throw new InvalidArgumentException('Parse CSV method must be string or file');
        }
    }

    /**
     * @param string $input
     * @throws Exception when in 'file' mode and the file does not exist.
     */
    protected function invoke(mixed $input): Generator
    {
        if ($this->method === 'file') {
            if (!file_exists($input)) {
                throw new Exception('CSV file not found');
            }

            yield from $this->readFile($input);
        } elseif ($this->method === 'string') {
            yield from $this->mapLines($this->splitIntoLines($input));
        }
    }

    /**
     * Reads the file row by row and yields the mapped rows.
     *
     * When no column mapping is defined, the first row of the file is used as (and stored in)
     * the column mapping. The first row is only left out of the output when skipFirstLine is set.
     */
    protected function readFile(string $filePath): Generator
    {
        $handle = fopen($filePath, 'r');

        if ($handle === false) {
            return;
        }

        // The finally block also runs when the generator is destroyed before it is fully consumed,
        // so the file handle is always released.
        try {
            $isFirstLine = true;

            while (($row = fgetcsv($handle, 0, $this->separator, $this->enclosure, $this->escape)) !== false) {
                if ($isFirstLine) {
                    if (empty($this->columnMapping)) {
                        $this->columnMapping = $row;
                    }

                    $isFirstLine = false;

                    if ($this->skipFirstLine) {
                        continue;
                    }
                }

                yield $this->mapRow($row);
            }
        } finally {
            fclose($handle);
        }
    }

    /**
     * Parses the given lines and yields the mapped rows. Empty lines are ignored.
     *
     * When skipFirstLine is set, the first line is not yielded, and it is used as (and stored in)
     * the column mapping if none is defined.
     *
     * @param string[] $lines
     * @return Generator
     */
    protected function mapLines(array $lines): Generator
    {
        foreach ($lines as $key => $line) {
            if ($key === 0 && $this->skipFirstLine) {
                if (empty($this->columnMapping)) {
                    $this->columnMapping = str_getcsv($line, $this->separator, $this->enclosure, $this->escape);
                }

                continue;
            }

            // Explicit comparison, because empty() would also drop a line consisting only of "0".
            if ($line !== '') {
                yield $this->mapRow(str_getcsv($line, $this->separator, $this->enclosure, $this->escape));
            }
        }
    }

    /**
     * Maps the columns of a row, by position, to the keys defined in the column mapping.
     *
     * @param mixed[] $row
     * @return mixed[]
     */
    protected function mapRow(array $row): array
    {
        $mapped = [];

        foreach (array_values($row) as $position => $column) {
            if (!empty($this->columnMapping[$position])) {
                $mapped[$this->columnMapping[$position]] = $column;
            }
        }

        return $mapped;
    }

    /**
     * Splits CSV content into lines on any common line ending (CRLF, LF or CR).
     *
     * CSV content, e.g. from HTTP responses, does not necessarily use the line ending of the
     * platform this code runs on, so splitting on PHP_EOL would be platform-dependent.
     * Note that the content is split before parsing, so line breaks inside of enclosed fields
     * also separate lines.
     *
     * @return string[]
     */
    private function splitIntoLines(string $content): array
    {
        $lines = preg_split('/\r\n|\n|\r/', $content);

        return $lines !== false ? $lines : explode(PHP_EOL, $content);
    }
}


================================================
FILE: src/Steps/Dom/DomDocument.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Dom;

use Dom\Document;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Base class for document nodes that are created from a source string.
 *
 * Child classes define, in makeDocumentInstance(), how the source string is turned into
 * the underlying document object.
 */
abstract class DomDocument extends Node
{
    /**
     * Creates the underlying document object from the source string and hands it to the Node constructor.
     */
    public function __construct(string $source)
    {
        parent::__construct($this->makeDocumentInstance($source)); // @phpstan-ignore-line
    }

    /**
     * Creates the underlying document object for the given source string.
     *
     * @param string $source
     * @return Document|Crawler
     */
    abstract protected function makeDocumentInstance(string $source): object;
}


================================================
FILE: src/Steps/Dom/HtmlDocument.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Dom;

use Crwlr\Utils\PhpVersion;
use DOMNode;
use Symfony\Component\DomCrawler\Crawler;

use const DOM\HTML_NO_DEFAULT_NS;

/**
 * @method HtmlElement|null querySelector(string $selector)
 * @method NodeList<int, HtmlElement> querySelectorAll(string $selector)
 * @method NodeList<int, HtmlElement> queryXPath(string $selector)
 */

class HtmlDocument extends DomDocument
{
    /**
     * Returns the href attribute of the first <base> element found in the document, or null
     * when the document doesn't contain one.
     *
     * Regarding multiple base elements in one document:
     * https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
     * "If multiple <base> elements are used, only the first href and first target are obeyed..."
     */
    public function getBaseHref(): ?string
    {
        return $this->querySelector('base')?->getAttribute('href');
    }

    /**
     * The HTML source of the whole document.
     */
    public function outerHtml(): string
    {
        $html = $this->outerSource();

        return $html;
    }

    /**
     * @param \Dom\Node|DOMNode|Crawler $node
     */
    protected function makeChildNodeInstance(object $node): Node
    {
        return new HtmlElement($node);
    }

    /**
     * Parses the source with \Dom\HTMLDocument on PHP >= 8.4, and with the symfony DomCrawler
     * on older PHP versions.
     *
     * @return \Dom\HTMLDocument|Crawler
     */
    protected function makeDocumentInstance(string $source): object
    {
        $fixedSource = $this->fixInvalidCharactersInSource($source);

        if (!PhpVersion::isAtLeast(8, 4)) {
            return new Crawler($fixedSource);
        }

        return \Dom\HTMLDocument::createFromString($fixedSource, HTML_NO_DEFAULT_NS | LIBXML_NOERROR);
    }

    /**
     * Converts the source to UTF-8 when it declares UTF-8 as charset, but isn't valid UTF-8.
     *
     * Source that is not valid UTF-8 is treated as ISO-8859-1. The source is returned unchanged
     * when the iconv function is not available, when no conversion is necessary, or when the
     * conversion fails.
     */
    private function fixInvalidCharactersInSource(string $source): string
    {
        if (!function_exists('iconv')) {
            return $source;
        }

        // Matching the empty pattern with the u modifier only succeeds for valid UTF-8 subjects.
        $actualCharset = preg_match('//u', $source) ? 'UTF-8' : 'ISO-8859-1';

        preg_match('/(charset *= *["\']?)([a-zA-Z\-0-9_:.]+)/i', $source, $matches);

        $declaredCharset = !empty($matches[2]) ? strtoupper($matches[2]) : null;

        if ($actualCharset !== 'ISO-8859-1' || $declaredCharset !== 'UTF-8') {
            return $source;
        }

        $converted = iconv("ISO-8859-1", "UTF-8//TRANSLIT", $source);

        return $converted === false ? $source : $converted;
    }
}


================================================
FILE: src/Steps/Dom/HtmlElement.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Dom;

use DOMNode;
use Symfony\Component\DomCrawler\Crawler;

/**
 * @method HtmlElement|null querySelector(string $selector)
 * @method NodeList<int, HtmlElement> querySelectorAll(string $selector)
 * @method NodeList<int, HtmlElement> queryXPath(string $selector)
 */

class HtmlElement extends Node
{
    /**
     * The HTML source of the element, including the element itself.
     */
    public function outerHtml(): string
    {
        $html = $this->outerSource();

        return $html;
    }

    /**
     * The HTML source of the element's content.
     */
    public function innerHtml(): string
    {
        $html = $this->innerSource();

        return $html;
    }

    /**
     * Alias for innerHtml().
     */
    public function html(): string
    {
        return $this->innerHtml();
    }

    /**
     * Child nodes of an HTML element are HTML elements as well.
     *
     * @param \Dom\Node|DOMNode|Crawler $node
     */
    protected function makeChildNodeInstance(object $node): Node
    {
        return new self($node);
    }
}


================================================
FILE: src/Steps/Dom/Node.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Dom;

use Dom\Document;
use Dom\Element;
use Dom\XPath;
use DOMNode;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Wrapper around a DOM node, that is backed either by a symfony DomCrawler instance
 * or by a node object of PHP's \Dom extension classes.
 */
abstract class Node
{
    /**
     * The wrapped node object.
     *
     * @var \Dom\Node|Element|Crawler
     */
    private object $node;

    /**
     * DOMNode instances are wrapped in a symfony Crawler, all other objects are stored as they are.
     *
     * @param \Dom\Node|Element|DOMNode|Crawler $node
     */
    public function __construct(object $node)
    {
        $this->node = $node instanceof DOMNode ? new Crawler($node) : $node;
    }

    /**
     * Returns the first node matching the CSS selector, or null when nothing matches.
     */
    public function querySelector(string $selector): ?Node
    {
        if (!$this->node instanceof Crawler) {
            $match = $this->node->querySelector($selector);

            return $match === null ? null : $this->makeChildNodeInstance($match);
        }

        $matches = $this->node->filter($selector);

        if ($matches->count() > 0) {
            return $this->makeChildNodeInstance($matches->first());
        }

        return null;
    }

    /**
     * Returns a list of all nodes matching the CSS selector.
     */
    public function querySelectorAll(string $selector): NodeList
    {
        $matches = $this->node instanceof Crawler
            ? $this->node->filter($selector)
            : $this->node->querySelectorAll($selector);

        return $this->makeNodeListInstance($matches);
    }

    /**
     * Returns a list of all nodes matching the XPath query.
     *
     * XPath queries are always executed via the symfony Crawler. When this node is not backed
     * by a Crawler, a new Crawler instance is created from the node's outer source.
     */
    public function queryXPath(string $query): NodeList
    {
        $crawler = $this->node instanceof Crawler ? $this->node : new Crawler($this->outerSource());

        return $this->makeNodeListInstance($crawler->filterXPath($query));
    }

    /**
     * Removes all nodes matching the CSS selector from their parent nodes.
     */
    public function removeNodesMatchingSelector(string $selector): void
    {
        foreach ($this->querySelectorAll($selector) as $match) {
            $domNode = $match->node instanceof Crawler ? $match->node->getNode(0) : $match->node;

            if ($domNode) {
                $domNode->parentNode?->removeChild($domNode);
            }
        }
    }

    /**
     * Removes all nodes matching the XPath query from their parent nodes.
     *
     * When this node is not backed by a Crawler, the query is executed on the document
     * that the node belongs to.
     */
    public function removeNodesMatchingXPath(string $query): void
    {
        if ($this->node instanceof Crawler) {
            foreach ($this->node->filterXPath($query) as $match) {
                $match->parentNode?->removeChild($match);
            }

            return;
        }

        $document = $this->getParentDocumentOfNode($this->node);

        if (!$document) {
            return;
        }

        $xpath = new XPath($document);

        foreach ($xpath->query($query) as $match) {
            $match->parentNode?->removeChild($match);
        }
    }

    /**
     * The lower-cased name of the node.
     */
    public function nodeName(): string
    {
        $name = $this->node instanceof Crawler ? $this->node->nodeName() : ($this->node->nodeName ?? '');

        return strtolower($name);
    }

    /**
     * The text content of the node, with whitespace normalized.
     *
     * Line breaks, tabs, form feeds and runs of multiple whitespace characters are replaced
     * with a single space, and the result is trimmed.
     */
    public function text(): string
    {
        if ($this->node instanceof Crawler) {
            $rawText = $this->node->text();
        } elseif (is_string($this->node->textContent)) {
            $rawText = $this->node->textContent;
        } else {
            $rawText = '';
        }

        $normalized = preg_replace("/(?:[ \n\r\t\x0C]{2,}+|[\n\r\t\x0C])/", ' ', $rawText) ?? $rawText;

        return trim($normalized, " \n\r\t\x0C");
    }

    /**
     * Returns the value of the attribute with the given name.
     */
    public function getAttribute(string $attributeName): ?string
    {
        return $this->node instanceof Crawler
            ? $this->node->attr($attributeName)
            : $this->node->getAttribute($attributeName);
    }

    /**
     * @param \Dom\Node|DOMNode|Crawler $node
     */
    abstract protected function makeChildNodeInstance(object $node): Node;

    /**
     * The source of the node, including the node itself.
     *
     * Documents are serialized starting with their document element, other nodes are
     * serialized by the document they belong to. When that isn't possible, the innerHTML
     * property of the node is the fallback.
     */
    protected function outerSource(): string
    {
        if ($this->node instanceof Crawler) {
            return $this->node->count() > 0 ? $this->node->outerHtml() : '';
        }

        if ($this->node instanceof Document) {
            $source = $this->serializeWithDocument($this->node, $this->node->documentElement);

            if ($source !== null) {
                return $source;
            }
        }

        $parentDocument = $this->getParentDocumentOfNode($this->node);

        if ($parentDocument) {
            $source = $this->serializeWithDocument($parentDocument, $this->node);

            if ($source !== null) {
                return $source;
            }
        }

        return $this->node->innerHTML;
    }

    /**
     * The source of the node's content.
     */
    protected function innerSource(): string
    {
        return $this->node instanceof Crawler ? $this->node->html() : $this->node->innerHTML;
    }

    /**
     * @param \Dom\NodeList<\Dom\Node>|Crawler $nodeList
     */
    protected function makeNodeListInstance(object $nodeList): NodeList
    {
        $makeChildNode = function (object $node): Node {
            /** @var DOMNode|\Dom\Node $node */
            return $this->makeChildNodeInstance($node);
        };

        return new NodeList($nodeList, $makeChildNode);
    }

    /**
     * Serializes a node using the save method of the given document.
     *
     * Returns null when the document is neither a \Dom\HTMLDocument nor a \Dom\XMLDocument.
     * A failed XML serialization results in an empty string.
     */
    private function serializeWithDocument(object $document, ?object $node): ?string
    {
        if ($document instanceof \Dom\HTMLDocument) {
            return $document->saveHTML($node);
        }

        if ($document instanceof \Dom\XMLDocument) {
            $xml = $document->saveXML($node);

            return $xml !== false ? $xml : '';
        }

        return null;
    }

    /**
     * Walks up the parent nodes until a document is found.
     *
     * @param \Dom\Node|Element $node
     * @return Document|null
     */
    private function getParentDocumentOfNode(object $node): ?object
    {
        $candidate = $node;

        while ($candidate && !$candidate instanceof Document) {
            $candidate = $candidate->parentNode;
        }

        return $candidate instanceof Document ? $candidate : null;
    }
}


================================================
FILE: src/Steps/Dom/NodeList.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Dom;

use ArrayIterator;
use Closure;
use Countable;
use Dom\Element;
use DOMNode;
use Exception;
use Iterator;
use IteratorAggregate;
use Symfony\Component\DomCrawler\Crawler;

/**
 * @implements IteratorAggregate<int, Node>
 */

/**
 * List of Node objects.
 *
 * The list either wraps an array of Node instances, or an object providing count() and
 * getIterator() (e.g. a symfony Crawler or a \Dom\NodeList). In the latter case, the elements
 * are converted to Node instances by the provided closure, when they are accessed.
 */
class NodeList implements IteratorAggregate, Countable
{
    /**
     * @param \Dom\NodeList<\Dom\Node>|\Dom\NodeList<Element>|Crawler|array<Node> $nodeList
     */
    public function __construct(
        private readonly object|array $nodeList,
        private readonly ?Closure $makeNodeInstance = null,
    ) {}

    /**
     * The first node of the list, or null when there is none.
     *
     * @throws Exception
     */
    public function first(): ?Node
    {
        $nodes = $this->getIterator();

        $nodes->rewind();

        return $nodes->current();
    }

    /**
     * The last node of the list, or null when the list is empty.
     *
     * @throws Exception
     */
    public function last(): ?Node
    {
        $last = null;

        foreach ($this->getIterator() as $current) {
            $last = $current;
        }

        return $last;
    }

    /**
     * The node at the given position, or null when there is no such node.
     * Positions start at 1.
     *
     * @throws Exception
     */
    public function nth(int $index): ?Node
    {
        $position = 1;

        foreach ($this->getIterator() as $node) {
            if ($position === $index) {
                return $node;
            }

            $position++;
        }

        return null;
    }

    /**
     * Calls the callback with each node and its key, and returns the collected return values.
     *
     * @return mixed[]
     * @throws Exception
     */
    public function each(Closure $callback): array
    {
        $results = [];

        foreach ($this->getIterator() as $key => $node) {
            $results[] = $callback($node, $key);
        }

        return $results;
    }

    /**
     * @return int<0, max>
     */
    public function count(): int
    {
        return is_array($this->nodeList) ? count($this->nodeList) : max(0, $this->nodeList->count());
    }

    /**
     * Arrays are iterated as they are. The elements of wrapped list objects are
     * converted to Node instances on access.
     */
    public function getIterator(): Iterator
    {
        if (is_array($this->nodeList)) {
            return new ArrayIterator($this->nodeList);
        }

        /** @var Iterator<int, DOMNode|\Dom\Node> $innerIterator */
        $innerIterator = $this->nodeList->getIterator();

        return new class ($innerIterator, $this->makeNodeInstance) implements Iterator {
            /**
             * @param Iterator<int, DOMNode|\Dom\Node> $inner
             */
            public function __construct(
                private readonly Iterator $inner,
                private readonly ?Closure $nodeFactory = null,
            ) {}

            public function current(): ?Node
            {
                return $this->toNode($this->inner->current());
            }

            public function next(): void
            {
                $this->inner->next();
            }

            public function key(): mixed
            {
                return $this->inner->key();
            }

            public function valid(): bool
            {
                return $this->inner->valid();
            }

            public function rewind(): void
            {
                $this->inner->rewind();
            }

            /**
             * Non-object values, and elements when no factory closure is defined, become null.
             *
             * @param \Dom\Node|DOMNode|Crawler $element
             */
            private function toNode(mixed $element): ?Node
            {
                if (is_object($element)) { // @phpstan-ignore-line change when min. required PHP version is 8.4.
                    return $this->nodeFactory?->__invoke($element);
                }

                return null;
            }
        };
    }
}


================================================
FILE: src/Steps/Dom/XmlDocument.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Dom;

use Crwlr\Utils\PhpVersion;
use DOMNode;
use Symfony\Component\DomCrawler\Crawler;
use Throwable;
use voku\helper\ASCII;

/**
 * @method XmlElement|null querySelector(string $selector)
 * @method NodeList<int, XmlElement> querySelectorAll(string $selector)
 * @method NodeList<int, XmlElement> queryXPath(string $selector)
 */

class XmlDocument extends DomDocument
{
    /**
     * The XML source of the whole document.
     */
    public function outerXml(): string
    {
        return $this->outerSource();
    }

    /**
     * @param \Dom\Node|DOMNode|Crawler $node
     */
    protected function makeChildNodeInstance(object $node): Node
    {
        return new XmlElement($node);
    }

    /**
     * Parses the XML source.
     *
     * On PHP >= 8.4 the source is parsed with \Dom\XMLDocument. When that fails, it is tried
     * once more, after replacing invalid characters. When it fails again, or on older PHP versions,
     * the symfony DomCrawler is used (also with a second try with replaced characters, when the
     * first try results in an empty Crawler).
     *
     * @return \Dom\XMLDocument|Crawler
     */
    protected function makeDocumentInstance(string $source): object
    {
        if (PhpVersion::isAtLeast(8, 4)) {
            try {
                return \Dom\XMLDocument::createFromString($source, LIBXML_NOERROR | LIBXML_NONET);
            } catch (Throwable) {
                $source = $this->replaceInvalidXmlCharacters($source);

                try {
                    return \Dom\XMLDocument::createFromString($source, LIBXML_NOERROR | LIBXML_NONET);
                } catch (Throwable) {
                } // If it fails again, try it with symfony DOM Crawler as fallback.
            }
        }

        $crawler = new Crawler($source);

        if ($crawler->count() === 0) {
            $source = $this->replaceInvalidXmlCharacters($source);

            $crawler = new Crawler($source);
        }

        return $crawler;
    }

    /**
     * Replace characters that aren't valid within XML documents
     *
     * Sometimes XML parsing errors occur because of characters that aren't valid within XML documents.
     * Therefore, this method finds them and replaces them, either with a transliteration (when the
     * voku/portable-ascii composer package is installed), or with a numeric character reference.
     *
     * The characters considered valid are those of the Char production of the XML 1.0 specification,
     * which includes the supplementary planes (U+10000 - U+10FFFF, e.g. emojis). Those characters
     * are valid and therefore must be left untouched.
     *
     * When the replacement fails (e.g. because the value is not valid UTF-8), the value is
     * returned unchanged.
     *
     * @param string $value
     * @return string
     */
    private function replaceInvalidXmlCharacters(string $value): string
    {
        $invalidCharacter = '/[^\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u';

        return preg_replace_callback($invalidCharacter, function ($match) {
            $replacement = class_exists('voku\helper\ASCII') ? ASCII::to_transliterate($match[0]) : '?';

            // A question mark means, there is no transliteration for the character.
            if ($replacement === '?') {
                return '&#' . mb_ord($match[0]) . ';';
            }

            return $replacement;
        }, $value) ?? $value;
    }
}


================================================
FILE: src/Steps/Dom/XmlElement.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Dom;

use DOMNode;
use Symfony\Component\DomCrawler\Crawler;

/**
 * @method XmlElement|null querySelector(string $selector)
 * @method NodeList<int, XmlElement> querySelectorAll(string $selector)
 * @method NodeList<int, XmlElement> queryXPath(string $selector)
 */

class XmlElement extends Node
{
    /**
     * The XML source of the element, including the element itself.
     */
    public function outerXml(): string
    {
        $xml = $this->outerSource();

        return $xml;
    }

    /**
     * The XML source of the element's content.
     */
    public function innerXml(): string
    {
        $xml = $this->innerSource();

        return $xml;
    }

    /**
     * Child nodes of an XML element are XML elements as well.
     *
     * @param \Dom\Node|DOMNode|Crawler $node
     */
    protected function makeChildNodeInstance(object $node): Node
    {
        return new self($node);
    }
}


================================================
FILE: src/Steps/Dom.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Logger\PreStepInvocationLogger;
use Crwlr\Crawler\Steps\Dom\DomDocument;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Dom\Node;
use Crwlr\Crawler\Steps\Dom\NodeList;
use Crwlr\Crawler\Steps\Dom\XmlDocument;
use Crwlr\Crawler\Steps\Html\CssSelector;
use Crwlr\Crawler\Steps\Html\DomQuery;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Crawler\Steps\Html\XPathQuery;
use Crwlr\Html2Text\Exceptions\InvalidHtmlException;
use Exception;
use Generator;
use InvalidArgumentException;

/**
 * Base class for steps extracting data from a DOM document using CSS selectors or XPath queries.
 *
 * The static factories root(), each(), first() and last() define the base node(s) the extraction
 * starts from. What is extracted is defined via extract(): either one single selector, or a mapping
 * of output keys to selectors (or nested Dom steps).
 */
abstract class Dom extends Step
{
    /**
     * True when the input document/node itself is used as base (see root()).
     */
    protected bool $root = false;

    /**
     * Base query when created via each(): all matching nodes are used as base.
     */
    protected ?DomQuery $each = null;

    /**
     * Base query when created via first(): the first matching node is used as base.
     */
    protected ?DomQuery $first = null;

    /**
     * Base query when created via last(): the last matching node is used as base.
     */
    protected ?DomQuery $last = null;

    /**
     * Output key => selector (or nested Dom step) to extract that property with.
     *
     * @var array<int|string, string|DomQuery|Dom>
     */
    protected array $mapping = [];

    /**
     * Selector used when extract() was called with a single selector instead of a mapping.
     */
    protected string|DomQuery|null $singleSelector = null;

    /**
     * Effective URI of the response the document came from; handed on to the DOM queries.
     */
    protected ?string $baseUrl = null;

    /**
     * Registers a PreStepInvocationLogger and stores the selector/mapping via extract().
     *
     * @param string|DomQuery|array<int|string, string|DomQuery> $selectorOrMapping
     */
    final public function __construct(string|DomQuery|array $selectorOrMapping = [])
    {
        $this->addLogger(new PreStepInvocationLogger());

        $this->extract($selectorOrMapping);
    }

    /**
     * Create an instance that uses the input document/node itself as base.
     */
    public static function root(): static
    {
        $instance = new static();

        $instance->root = true;

        return $instance;
    }

    /**
     * Create an instance that uses every node matching the query as base.
     *
     * A string is converted to the default query type of the concrete step. An empty query only
     * triggers a warning; the input itself is then used as the one and only base element.
     */
    public static function each(string|DomQuery $domQuery): static
    {
        $instance = new static();

        $instance->each = is_string($domQuery) ? $instance->makeDefaultDomQueryInstance($domQuery) : $domQuery;

        if (trim($instance->each->query) === '') {
            $instance->logger?->warning(
                'The selector you provided for the ‘each’ option is empty. This option is intended to allow ' .
                'extracting multiple output objects from a single page, so an empty selector most likely doesn’t ' .
                'make sense, as it will definitely result in only one output object.',
            );
        }

        return $instance;
    }

    /**
     * Create an instance that uses the first node matching the query as base.
     *
     * A string is converted to the default query type of the concrete step. An empty query only
     * triggers a warning; the input itself is then used as base.
     */
    public static function first(string|DomQuery $domQuery): static
    {
        $instance = new static();

        $instance->first = is_string($domQuery) ? $instance->makeDefaultDomQueryInstance($domQuery) : $domQuery;

        if (trim($instance->first->query) === '') {
            $instance->logger?->warning(
                'The selector you provided for the ‘first’ option is empty. This option is meant to restrict your ' .
                'extraction to a specific parent element, so an empty selector most likely doesn’t make sense. ' .
                'Either define the desired selector or use the root() method instead.',
            );
        }

        return $instance;
    }

    /**
     * Create an instance that uses the last node matching the query as base.
     *
     * A string is converted to the default query type of the concrete step. An empty query only
     * triggers a warning; the input itself is then used as base.
     */
    public static function last(string|DomQuery $domQuery): static
    {
        $instance = new static();

        $instance->last = is_string($domQuery) ? $instance->makeDefaultDomQueryInstance($domQuery) : $domQuery;

        if (trim($instance->last->query) === '') {
            $instance->logger?->warning(
                'The selector you provided for the ‘last’ option is empty. This option is meant to restrict your ' .
                'extraction to a specific parent element, so an empty selector most likely doesn’t make sense. ' .
                'Either define the desired selector or use the root() method instead.',
            );
        }

        return $instance;
    }

    /**
     * Shortcut to create a CssSelector query object.
     *
     * @throws InvalidDomQueryException
     */
    public static function cssSelector(string $selector): CssSelector
    {
        return new CssSelector($selector);
    }

    /**
     * Shortcut to create an XPathQuery query object.
     *
     * @throws InvalidDomQueryException
     */
    public static function xPath(string $query): XPathQuery
    {
        return new XPathQuery($query);
    }

    /**
     * Turn a plain string selector into the query type the concrete step uses by default.
     */
    abstract protected function makeDefaultDomQueryInstance(string $query): DomQuery;

    /**
     * Define what to extract: an array is stored as property mapping, anything else as single selector.
     *
     * @param string|DomQuery|array<string|DomQuery|Dom> $selectorOrMapping
     */
    public function extract(string|DomQuery|array $selectorOrMapping): static
    {
        if (is_array($selectorOrMapping)) {
            $this->mapping = $selectorOrMapping;
        } else {
            $this->singleSelector = $selectorOrMapping;
        }

        return $this;
    }

    /**
     * Scalar when only a single selector is used, otherwise an associative array (mapping).
     */
    public function outputType(): StepOutputType
    {
        return empty($this->mapping) && $this->singleSelector ?
            StepOutputType::Scalar :
            StepOutputType::AssociativeArrayOrObject;
    }

    /**
     * Resolve the base node(s) and yield the extracted data.
     *
     * Yields nothing when no base node was found. With a single selector the selected values are
     * yielded one by one; with a mapping one array of mapped properties is yielded per base node.
     *
     * @param HtmlDocument|Node $input
     * @throws Exception
     */
    protected function invoke(mixed $input): Generator
    {
        $base = $this->getBase($input);

        if (!$base || ($base instanceof NodeList && $base->count() === 0)) {
            return;
        }

        if (empty($this->mapping) && $this->singleSelector) {
            yield from $this->singleSelector($base);
        } else {
            if ($this->each) {
                if ($base instanceof NodeList) {
                    foreach ($base as $element) {
                        yield $this->mapProperties($element);
                    }
                }
            } elseif ($base instanceof Node) {
                yield $this->mapProperties($base);
            }
        }
    }

    /**
     * Build an HtmlDocument from a string or HTTP response input.
     *
     * When the input is a RespondedRequest, its effective URI is remembered as base URL for the queries.
     *
     * @throws InvalidArgumentException|MissingZlibExtensionException
     */
    protected function validateAndSanitizeInput(mixed $input): HtmlDocument|XmlDocument
    {
        if ($input instanceof RespondedRequest) {
            $this->baseUrl = $input->effectiveUri();
        }

        return new HtmlDocument($this->validateAndSanitizeStringOrHttpResponse($input));
    }

    /**
     * Apply the single selector to the given base node(s) and yield the results.
     *
     * For a NodeList the query result of every node is yielded as one output. For a single node,
     * an array result is yielded element by element and a null result yields nothing.
     *
     * @throws InvalidHtmlException
     * @throws Exception
     */
    protected function singleSelector(Node|NodeList $nodeOrNodeList): Generator
    {
        if ($this->singleSelector === null) {
            return;
        }

        $domQuery = is_string($this->singleSelector) ?
            $this->makeDefaultDomQueryInstance($this->singleSelector) :
            $this->singleSelector;

        if ($this->baseUrl !== null) {
            $domQuery->setBaseUrl($this->baseUrl);
        }

        if ($nodeOrNodeList instanceof NodeList) {
            $outputs = [];

            foreach ($nodeOrNodeList as $node) {
                $outputs[] = $domQuery->apply($node);
            }
        } else {
            $outputs = $domQuery->apply($nodeOrNodeList);
        }

        if (is_array($outputs)) {
            foreach ($outputs as $output) {
                yield $output;
            }
        } elseif ($outputs !== null) {
            yield $outputs;
        }
    }

    /**
     * Apply every entry of the mapping to the given node.
     *
     * Nested Dom steps are invoked with the node as input, all other entries are applied as DOM query.
     *
     * @return mixed[]
     * @throws Exception
     */
    protected function mapProperties(Node $node): array
    {
        $mappedProperties = [];

        foreach ($this->mapping as $key => $domQuery) {
            if ($domQuery instanceof Dom) {
                // The child step never sees the HTTP response, so it has to inherit the base URL.
                $domQuery->baseUrl = $this->baseUrl;

                $mappedProperties[$key] = $this->getDataFromChildDomStep($domQuery, $node);
            } else {
                if (is_string($domQuery)) {
                    $domQuery = $this->makeDefaultDomQueryInstance($domQuery);
                }

                if ($this->baseUrl !== null) {
                    $domQuery->setBaseUrl($this->baseUrl);
                }

                $mappedProperties[$key] = $domQuery->apply($node);
            }
        }

        return $mappedProperties;
    }

    /**
     * Get the node(s) to extract from, depending on the factory the instance was created with.
     *
     * @throws Exception when the instance was created without root(), each(), first() or last()
     */
    protected function getBase(DomDocument|Node $document): Node|NodeList|null
    {
        if ($this->root) {
            return $document;
        } elseif ($this->each) {
            return $this->getBaseFromDomNode($document, $this->each, each: true);
        } elseif ($this->first) {
            return $this->getBaseFromDomNode($document, $this->first, first: true);
        } elseif ($this->last) {
            return $this->getBaseFromDomNode($document, $this->last, last: true);
        }

        throw new Exception('Invalid state: no base selector');
    }

    /**
     * Run the base query on the document and pick all, the first or the last match.
     *
     * An empty query selects the document itself (wrapped in a NodeList for each). The decision
     * between CSS selector and XPath is always based on the $query argument, so the method does
     * not depend on which property the caller took the query from.
     *
     * @throws Exception
     */
    private function getBaseFromDomNode(
        DomDocument|Node $document,
        DomQuery $query,
        bool $each = false,
        bool $first = false,
        bool $last = false,
    ): Node|NodeList|null {
        if (trim($query->query) === '') {
            return $each ? new NodeList([$document]) : $document;
        }

        $isCssSelector = $query instanceof CssSelector;

        if ($each) {
            return $isCssSelector ?
                $document->querySelectorAll($query->query) :
                $document->queryXPath($query->query);
        } elseif ($first) {
            return $isCssSelector ?
                $document->querySelector($query->query) :
                $document->queryXPath($query->query)->first();
        } elseif ($last) {
            return $isCssSelector ?
                $document->querySelectorAll($query->query)->last() :
                $document->queryXPath($query->query)->last();
        }

        return $document;
    }

    /**
     * Invoke a nested Dom step with the given node and collect its outputs.
     *
     * @return mixed[]
     * @throws Exception
     */
    protected function getDataFromChildDomStep(Dom $step, Node $node): array
    {
        $childValue = iterator_to_array($step->invoke($node));

        // When the child step was not used with each() as base and the result is an array with one
        // element (index/key "0") being an array, use that child array.
        if (!$step->each && count($childValue) === 1 && isset($childValue[0]) && is_array($childValue[0])) {
            return $childValue[0];
        }

        return $childValue;
    }
}


================================================
FILE: src/Steps/Exceptions/PreRunValidationException.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Exceptions;

use Exception;

/**
 * Dedicated exception type for pre-run validation problems.
 *
 * NOTE(review): purpose inferred from the class name only; where it is thrown is not visible
 * in this file - confirm against its usages.
 */
class PreRunValidationException extends Exception {}


================================================
FILE: src/Steps/Filters/AbstractFilter.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

use Exception;
use InvalidArgumentException;

/**
 * Shared behavior for filters: selecting a key from array/object values, chaining filters
 * with OR and negating a filter.
 */
abstract class AbstractFilter implements FilterInterface
{
    /**
     * Key to pick from array/object values before evaluating; null means: use the value as is.
     */
    protected ?string $useKey = null;

    /**
     * The filter linked to this one with OR, or false when there is none.
     */
    protected bool|FilterInterface $or = false;

    public function useKey(string $key): static
    {
        $this->useKey = $key;

        return $this;
    }

    /**
     * Step::orWhere() uses this method to link further Filters with OR to this filter.
     * The Step then takes care of checking if one of the ORs evaluates to true.
     *
     * When a filter is already linked, the new one is appended to the end of that chain.
     */
    public function addOr(FilterInterface $filter): void
    {
        if (!$this->or instanceof FilterInterface) {
            $this->or = $filter;

            return;
        }

        $tail = $this->or;

        while (($next = $tail->getOr()) !== null) {
            $tail = $next;
        }

        $tail->addOr($filter);
    }

    /**
     * Get the Filter linked to this Filter as OR.
     */
    public function getOr(): ?FilterInterface
    {
        if ($this->or instanceof FilterInterface) {
            return $this->or;
        }

        return null;
    }

    /**
     * Wrap this filter in a NegatedFilter.
     */
    public function negate(): NegatedFilter
    {
        return new NegatedFilter($this);
    }

    /**
     * Get the value to evaluate: the value itself, or the element/property named by useKey().
     *
     * For objects that don't have the property, but a __serialize() method, the key is also
     * looked up in the serialized array.
     *
     * @throws InvalidArgumentException when a key is set, but the value is neither array nor object
     * @throws Exception when the key does not exist in the value
     */
    protected function getKey(mixed $value): mixed
    {
        $key = $this->useKey;

        if ($key === null) {
            return $value;
        }

        if (!is_array($value) && !is_object($value)) {
            throw new InvalidArgumentException('Can only filter by key with array or object output.');
        }

        if (is_object($value) && !property_exists($value, $key) && method_exists($value, '__serialize')) {
            $serialized = $value->__serialize();

            if (array_key_exists($key, $serialized)) {
                $value = $serialized;
            }
        }

        if (is_array($value)) {
            if (!array_key_exists($key, $value)) {
                throw new Exception('Key to filter by does not exist in output.');
            }

            return $value[$key];
        }

        if (!property_exists($value, $key)) {
            throw new Exception('Key to filter by does not exist in output.');
        }

        return $value->{$key};
    }
}


================================================
FILE: src/Steps/Filters/ArrayFilter.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

use Exception;

/**
 * Filter for array values: passes when at least one element of the array passes all the
 * filters added to this filter via where() / orWhere().
 */
class ArrayFilter extends AbstractFilter
{
    use Filterable;

    /**
     * Returns false for non-array and empty array values.
     *
     * @throws Exception
     */
    public function evaluate(mixed $valueInQuestion): bool
    {
        $elements = $this->getKey($valueInQuestion);

        if (!is_array($elements)) {
            return false;
        }

        // An empty array never enters the loop and therefore evaluates to false.
        foreach ($elements as $element) {
            if ($this->passesAllFilters($element)) {
                return true;
            }
        }

        return false;
    }
}


================================================
FILE: src/Steps/Filters/ClosureFilter.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

use Closure;
use Exception;

/**
 * Filter that delegates the decision to a user-provided closure.
 */
class ClosureFilter extends AbstractFilter
{
    public function __construct(
        protected readonly Closure $closure,
    ) {}

    /**
     * Calls the closure, bound to this filter instance, with the value (or the element selected
     * via useKey()) as only argument and returns its result.
     *
     * @throws Exception
     */
    public function evaluate(mixed $valueInQuestion): bool
    {
        return $this->closure->call($this, $this->getKey($valueInQuestion));
    }
}


================================================
FILE: src/Steps/Filters/ComparisonFilter.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\Enums\ComparisonFilterRule;
use Exception;

/**
 * Filter comparing a value to a fixed reference value, using a ComparisonFilterRule.
 */
class ComparisonFilter extends AbstractFilter
{
    public function __construct(
        protected readonly ComparisonFilterRule $filterRule,
        protected readonly mixed $compareTo,
    ) {}

    /**
     * Compares the value (or the element selected via useKey()) to the reference value.
     *
     * @throws Exception
     */
    public function evaluate(mixed $valueInQuestion): bool
    {
        $actualValue = $this->getKey($valueInQuestion);

        return $this->filterRule->evaluate($actualValue, $this->compareTo);
    }
}


================================================
FILE: src/Steps/Filters/Enums/ComparisonFilterRule.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters\Enums;

/**
 * The comparison operations a ComparisonFilter can perform.
 */
enum ComparisonFilterRule
{
    case Equal;

    case NotEqual;

    case GreaterThan;

    case GreaterThanOrEqual;

    case LessThan;

    case LessThanOrEqual;

    /**
     * Compare $value to $compareTo using the operator this case stands for.
     *
     * Equal and NotEqual compare strictly (type and value), the ordering cases use PHP's
     * regular comparison operators.
     */
    public function evaluate(mixed $value, mixed $compareTo): bool
    {
        return match ($this) {
            self::Equal => $value === $compareTo,
            self::NotEqual => $value !== $compareTo,
            self::GreaterThan => $value > $compareTo,
            self::GreaterThanOrEqual => $value >= $compareTo,
            self::LessThan => $value < $compareTo,
            self::LessThanOrEqual => $value <= $compareTo,
        };
    }
}


================================================
FILE: src/Steps/Filters/Enums/StringFilterRule.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters\Enums;

/**
 * The substring checks a StringFilter can perform.
 */
enum StringFilterRule
{
    case Contains;

    case StartsWith;

    case EndsWith;

    /**
     * Check if $needle occurs in $haystack at the position this case stands for.
     */
    public function evaluate(string $haystack, string $needle): bool
    {
        $matches = match ($this) {
            self::Contains => str_contains($haystack, $needle),
            self::StartsWith => str_starts_with($haystack, $needle),
            self::EndsWith => str_ends_with($haystack, $needle),
        };

        return $matches;
    }
}


================================================
FILE: src/Steps/Filters/Enums/StringLengthFilterRule.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters\Enums;

/**
 * The length comparisons a StringLengthFilter can perform.
 */
enum StringLengthFilterRule
{
    case Equal;

    case NotEqual;

    case GreaterThan;

    case GreaterThanOrEqual;

    case LessThan;

    case LessThanOrEqual;

    /**
     * Compare the length of $subject to $compareTo.
     *
     * The length is determined with strlen(), so it is the number of bytes, not the number
     * of characters, of the string.
     */
    public function evaluate(string $subject, int $compareTo): bool
    {
        $length = strlen($subject);

        return match ($this) {
            self::Equal => $length === $compareTo,
            self::NotEqual => $length !== $compareTo,
            self::GreaterThan => $length > $compareTo,
            self::GreaterThanOrEqual => $length >= $compareTo,
            self::LessThan => $length < $compareTo,
            self::LessThanOrEqual => $length <= $compareTo,
        };
    }
}


================================================
FILE: src/Steps/Filters/Enums/UrlFilterRule.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters\Enums;

use Crwlr\Url\Exceptions\InvalidUrlException;
use Crwlr\Url\Url;
use Exception;

/**
 * The URL components a UrlFilter can check.
 */
enum UrlFilterRule
{
    case Scheme;

    case Host;

    case Domain;

    case Path;

    case PathStartsWith;

    case PathMatches;

    /**
     * Parse $url and check the component this case stands for against $needle.
     *
     * Scheme, Host, Domain and Path require an exact match, PathStartsWith a prefix match and
     * PathMatches treats $needle as a regular expression (without delimiters). Any exception
     * thrown while parsing or checking makes the method return false.
     */
    public function evaluate(string $url, string $needle): bool
    {
        try {
            $parsedUrl = Url::parse($url);

            return match ($this) {
                self::Scheme => $parsedUrl->scheme() === $needle,
                self::Host => $parsedUrl->host() === $needle,
                self::Domain => $parsedUrl->domain() === $needle,
                self::Path => $parsedUrl->path() === $needle,
                self::PathStartsWith => str_starts_with($parsedUrl->path() ?? '', $needle),
                self::PathMatches => preg_match($this->prepareRegex($needle), $parsedUrl->path() ?? '') === 1,
            };
        } catch (InvalidUrlException|Exception $exception) {
            return false;
        }
    }

    /**
     * Wrap the expression in `~` delimiters, so it can be passed to preg_match().
     */
    protected function prepareRegex(string $regex): string
    {
        return '~' . $regex . '~';
    }
}


================================================
FILE: src/Steps/Filters/Filter.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

use Closure;
use Crwlr\Crawler\Steps\Filters\Enums\ComparisonFilterRule;
use Crwlr\Crawler\Steps\Filters\Enums\StringFilterRule;
use Crwlr\Crawler\Steps\Filters\Enums\StringLengthFilterRule;
use Crwlr\Crawler\Steps\Filters\Enums\UrlFilterRule;

/**
 * Static factory collection for all the filter types in this namespace.
 */
abstract class Filter
{
    /**
     * Passes when the value is strictly equal (===) to the given value.
     */
    public static function equal(mixed $equalToValue): ComparisonFilter
    {
        return new ComparisonFilter(ComparisonFilterRule::Equal, $equalToValue);
    }

    /**
     * Passes when the value is not strictly equal (!==) to the given value.
     */
    public static function notEqual(mixed $notEqualToValue): ComparisonFilter
    {
        return new ComparisonFilter(ComparisonFilterRule::NotEqual, $notEqualToValue);
    }

    /**
     * Passes when the value is greater than (>) the given value.
     */
    public static function greaterThan(mixed $greaterThanValue): ComparisonFilter
    {
        return new ComparisonFilter(ComparisonFilterRule::GreaterThan, $greaterThanValue);
    }

    /**
     * Passes when the value is greater than or equal to (>=) the given value.
     */
    public static function greaterThanOrEqual(mixed $greaterThanOrEqualValue): ComparisonFilter
    {
        return new ComparisonFilter(ComparisonFilterRule::GreaterThanOrEqual, $greaterThanOrEqualValue);
    }

    /**
     * Passes when the value is less than (<) the given value.
     */
    public static function lessThan(mixed $lessThanValue): ComparisonFilter
    {
        return new ComparisonFilter(ComparisonFilterRule::LessThan, $lessThanValue);
    }

    /**
     * Passes when the value is less than or equal to (<=) the given value.
     */
    public static function lessThanOrEqual(mixed $lessThanOrEqualValue): ComparisonFilter
    {
        return new ComparisonFilter(ComparisonFilterRule::LessThanOrEqual, $lessThanOrEqualValue);
    }

    /**
     * Passes when the value is a string containing the given string.
     */
    public static function stringContains(string $containsValue): StringFilter
    {
        return new StringFilter(StringFilterRule::Contains, $containsValue);
    }

    /**
     * Passes when the value is a string starting with the given string.
     */
    public static function stringStartsWith(string $startsWithValue): StringFilter
    {
        return new StringFilter(StringFilterRule::StartsWith, $startsWithValue);
    }

    /**
     * Passes when the value is a string ending with the given string.
     */
    public static function stringEndsWith(string $endsWithValue): StringFilter
    {
        return new StringFilter(StringFilterRule::EndsWith, $endsWithValue);
    }

    /**
     * Passes when the value is a string whose length (strlen(), i.e. bytes) equals $length.
     */
    public static function stringLengthEqual(int $length): StringLengthFilter
    {
        return new StringLengthFilter(StringLengthFilterRule::Equal, $length);
    }

    /**
     * Passes when the value is a string whose length (strlen(), i.e. bytes) does not equal $length.
     */
    public static function stringLengthNotEqual(int $length): StringLengthFilter
    {
        return new StringLengthFilter(StringLengthFilterRule::NotEqual, $length);
    }

    /**
     * Passes when the value is a string whose length (strlen(), i.e. bytes) is greater than $length.
     */
    public static function stringLengthGreaterThan(int $length): StringLengthFilter
    {
        return new StringLengthFilter(StringLengthFilterRule::GreaterThan, $length);
    }

    /**
     * Passes when the value is a string whose length (strlen(), i.e. bytes) is at least $length.
     */
    public static function stringLengthGreaterThanOrEqual(int $length): StringLengthFilter
    {
        return new StringLengthFilter(StringLengthFilterRule::GreaterThanOrEqual, $length);
    }

    /**
     * Passes when the value is a string whose length (strlen(), i.e. bytes) is less than $length.
     */
    public static function stringLengthLessThan(int $length): StringLengthFilter
    {
        return new StringLengthFilter(StringLengthFilterRule::LessThan, $length);
    }

    /**
     * Passes when the value is a string whose length (strlen(), i.e. bytes) is at most $length.
     */
    public static function stringLengthLessThanOrEqual(int $length): StringLengthFilter
    {
        return new StringLengthFilter(StringLengthFilterRule::LessThanOrEqual, $length);
    }

    /**
     * Passes when the value is a URL string whose scheme equals the given value.
     */
    public static function urlScheme(string $urlSchemeValue): UrlFilter
    {
        return new UrlFilter(UrlFilterRule::Scheme, $urlSchemeValue);
    }

    /**
     * Passes when the value is a URL string whose host equals the given value.
     */
    public static function urlHost(string $urlHostValue): UrlFilter
    {
        return new UrlFilter(UrlFilterRule::Host, $urlHostValue);
    }

    /**
     * Passes when the value is a URL string whose domain equals the given value.
     */
    public static function urlDomain(string $urlDomainValue): UrlFilter
    {
        return new UrlFilter(UrlFilterRule::Domain, $urlDomainValue);
    }

    /**
     * Passes when the value is a URL string whose path equals the given value.
     */
    public static function urlPath(string $urlPathValue): UrlFilter
    {
        return new UrlFilter(UrlFilterRule::Path, $urlPathValue);
    }

    /**
     * Passes when the value is a URL string whose path starts with the given value.
     */
    public static function urlPathStartsWith(string $urlPathStartsWithValue): UrlFilter
    {
        return new UrlFilter(UrlFilterRule::PathStartsWith, $urlPathStartsWithValue);
    }

    /**
     * Passes when the value is a URL string whose path matches the given regular expression.
     * The expression is provided without delimiters; it is wrapped in `~` before matching.
     */
    public static function urlPathMatches(string $urlPathMatchesValue): UrlFilter
    {
        return new UrlFilter(UrlFilterRule::PathMatches, $urlPathMatchesValue);
    }

    /**
     * Passes when the value is an array with at least one element that passes all the filters
     * added to the returned ArrayFilter via where() / orWhere().
     */
    public static function arrayHasElement(): ArrayFilter
    {
        return new ArrayFilter();
    }

    /**
     * Passes when the given closure, called with the value as argument, says so.
     */
    public static function custom(Closure $closure): ClosureFilter
    {
        return new ClosureFilter($closure);
    }
}


================================================
FILE: src/Steps/Filters/FilterInterface.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

/**
 * Contract for filters that decide whether a value should be kept or filtered out.
 */
interface FilterInterface
{
    /**
     * When the value that will be evaluated is array or object, provide a key to use from that array/object.
     */
    public function useKey(string $key): static;

    /**
     * Shall return true if the $valueInQuestion should be kept or false when it should be filtered out.
     */
    public function evaluate(mixed $valueInQuestion): bool;

    /**
     * Link another filter to this filter with OR.
     */
    public function addOr(FilterInterface $filter): void;

    /**
     * Get the filter linked to this filter with OR, or null when there is none.
     */
    public function getOr(): ?FilterInterface;

    /**
     * Get a filter that evaluates to the opposite of this filter.
     */
    public function negate(): NegatedFilter;
}


================================================
FILE: src/Steps/Filters/Filterable.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

use Crwlr\Crawler\Steps\BaseStep;
use Exception;
use InvalidArgumentException;

/**
 * Adds where() / orWhere() filtering to a class and a helper to check a value against all filters.
 */
trait Filterable
{
    /**
     * Filters linked with AND. Each of them can have further filters linked to it with OR.
     *
     * @var FilterInterface[]
     */
    protected array $filters = [];

    /**
     * Add a filter that a value has to pass.
     *
     * Either provide only a filter, or a key and a filter. With a key, the filter is applied to
     * that key of array/object values. When the using class is a BaseStep and the key is an
     * output key alias, the real key behind the alias is used.
     *
     * @throws InvalidArgumentException when a key, but no filter is provided
     */
    public function where(string|FilterInterface $keyOrFilter, ?FilterInterface $filter = null): static
    {
        if (is_string($keyOrFilter) && $filter === null) {
            throw new InvalidArgumentException('You have to provide a Filter (instance of FilterInterface)');
        } elseif (is_string($keyOrFilter)) {
            if ($this instanceof BaseStep && $this->isOutputKeyAlias($keyOrFilter)) {
                $keyOrFilter = $this->getOutputKeyAliasRealKey($keyOrFilter);
            }

            $filter->useKey($keyOrFilter);

            $this->filters[] = $filter;
        } else {
            $this->filters[] = $keyOrFilter;
        }

        return $this;
    }

    /**
     * Link a filter with OR to the filter that was added last via where().
     *
     * Takes the same arguments as where() and resolves output key aliases the same way.
     *
     * @throws InvalidArgumentException when a key, but no filter is provided
     * @throws Exception when where() was not called before
     */
    public function orWhere(string|FilterInterface $keyOrFilter, ?FilterInterface $filter = null): static
    {
        if (empty($this->filters)) {
            throw new Exception('No where before orWhere');
        } elseif (is_string($keyOrFilter) && $filter === null) {
            throw new InvalidArgumentException('You have to provide a Filter (instance of FilterInterface)');
        } elseif (is_string($keyOrFilter)) {
            // Same alias resolution as in where(), so an aliased key behaves identically in both methods.
            if ($this instanceof BaseStep && $this->isOutputKeyAlias($keyOrFilter)) {
                $keyOrFilter = $this->getOutputKeyAliasRealKey($keyOrFilter);
            }

            $filter->useKey($keyOrFilter);
        } else {
            $filter = $keyOrFilter;
        }

        $lastFilter = end($this->filters);

        $lastFilter->addOr($filter);

        return $this;
    }

    /**
     * Check a value against all filters.
     *
     * Every filter added via where() has to pass, or at least one of the filters linked to it
     * with OR. Without any filters, every value passes.
     */
    protected function passesAllFilters(mixed $output): bool
    {
        foreach ($this->filters as $filter) {
            if (!$filter->evaluate($output)) {
                if ($filter->getOr()) {
                    $orFilter = $filter->getOr();

                    while ($orFilter) {
                        if ($orFilter->evaluate($output)) {
                            continue 2;
                        }

                        $orFilter = $orFilter->getOr();
                    }
                }

                return false;
            }
        }

        return true;
    }
}


================================================
FILE: src/Steps/Filters/NegatedFilter.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

/**
 * Decorator inverting the result of another filter.
 *
 * Everything except evaluate() and negate() is forwarded to the wrapped filter.
 */
final class NegatedFilter implements FilterInterface
{
    public function __construct(
        private readonly FilterInterface $filter,
    ) {}

    /**
     * Sets the key on the wrapped filter.
     */
    public function useKey(string $key): static
    {
        $this->filter->useKey($key);

        return $this;
    }

    /**
     * Returns the opposite of what the wrapped filter evaluates to.
     */
    public function evaluate(mixed $valueInQuestion): bool
    {
        $wrappedResult = $this->filter->evaluate($valueInQuestion);

        return !$wrappedResult;
    }

    /**
     * Links the given filter with OR to the wrapped filter.
     */
    public function addOr(FilterInterface $filter): void
    {
        $this->filter->addOr($filter);
    }

    /**
     * Gets the filter linked with OR to the wrapped filter.
     */
    public function getOr(): ?FilterInterface
    {
        return $this->filter->getOr();
    }

    /**
     * Negates this negated filter again by wrapping it in another NegatedFilter.
     */
    public function negate(): NegatedFilter
    {
        return new self($this);
    }
}


================================================
FILE: src/Steps/Filters/StringFilter.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\Enums\StringFilterRule;
use Exception;

/**
 * Filter checking if a string value contains, starts with or ends with a certain string.
 */
class StringFilter extends AbstractFilter
{
    public function __construct(
        protected readonly StringFilterRule $filterRule,
        protected readonly string $filterString,
    ) {}

    /**
     * Values that are not strings never pass.
     *
     * @throws Exception
     */
    public function evaluate(mixed $valueInQuestion): bool
    {
        $candidate = $this->getKey($valueInQuestion);

        return is_string($candidate) && $this->filterRule->evaluate($candidate, $this->filterString);
    }
}


================================================
FILE: src/Steps/Filters/StringLengthFilter.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\Enums\StringLengthFilterRule;
use Exception;

/**
 * Filter comparing the length of a string value to a fixed length.
 */
class StringLengthFilter extends AbstractFilter
{
    public function __construct(
        protected readonly StringLengthFilterRule $filterRule,
        protected readonly int $compareToLength,
    ) {}

    /**
     * Values that are not strings never pass.
     *
     * @throws Exception
     */
    public function evaluate(mixed $valueInQuestion): bool
    {
        $candidate = $this->getKey($valueInQuestion);

        return is_string($candidate) && $this->filterRule->evaluate($candidate, $this->compareToLength);
    }
}


================================================
FILE: src/Steps/Filters/UrlFilter.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\Enums\UrlFilterRule;
use Exception;

/**
 * Filter checking a component of a URL string against a certain string.
 */
class UrlFilter extends AbstractFilter
{
    public function __construct(
        protected readonly UrlFilterRule $filterRule,
        protected readonly string $filterString,
    ) {}

    /**
     * Values that are not strings never pass.
     *
     * @throws Exception
     */
    public function evaluate(mixed $valueInQuestion): bool
    {
        $candidate = $this->getKey($valueInQuestion);

        return is_string($candidate) && $this->filterRule->evaluate($candidate, $this->filterString);
    }
}


================================================
FILE: src/Steps/Group.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

use Crwlr\Crawler\Input;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Output;
use Exception;
use Generator;
use Psr\Log\LoggerInterface;

/**
 * A step consisting of multiple child steps that are all invoked with the same input.
 *
 * The outputs of the child steps are combined: the nth outputs of all child steps are merged
 * to the nth output of the group.
 */
final class Group extends BaseStep
{
    /**
     * The child steps, in the order they were added.
     *
     * @var StepInterface[]
     */
    private array $steps = [];

    /**
     * Loader that is handed on to all child steps having a setLoader() method.
     */
    private ?LoaderInterface $loader = null;

    /**
     * Invoke all child steps with the input and yield the combined outputs.
     *
     * @param Input $input
     * @return Generator<Output>
     * @throws Exception
     */
    public function invokeStep(Input $input): Generator
    {
        if ($this->uniqueInput && !$this->inputOrOutputIsUnique($input)) {
            return;
        }

        $this->storeOriginalInput($input);

        // When input is array and useInputKey() was used, invoke the steps only with that input array element,
        // but keep the original input, because we want to use it e.g. for the keepInputData() functionality.
        $stepInput = $this->getInputKeyToUse($input);

        if (!$stepInput) {
            return;
        }

        $outputsByIndex = [];

        $keptDataByIndex = [];

        foreach ($this->steps as $childStep) {
            foreach ($childStep->invokeStep($stepInput) as $index => $childOutput) {
                if (method_exists($childStep, 'callUpdateInputUsingOutput')) {
                    $stepInput = $childStep->callUpdateInputUsingOutput($stepInput, $childOutput);
                }

                if ($this->includeOutput($childStep)) {
                    $outputsByIndex = $this->addToCombinedOutputData($childOutput->get(), $outputsByIndex, $index);
                }

                // Also transfer data, kept in group child steps, to the kept data of the final group output.
                if ($childOutput->keep !== $stepInput->keep) {
                    $newlyKept = $this->getNewlyKeptData($childOutput, $stepInput);

                    $keptDataByIndex = $this->addToCombinedOutputData($newlyKept, $keptDataByIndex, $index);
                }
            }
        }

        yield from $this->prepareCombinedOutputs($outputsByIndex, $keptDataByIndex, $input);
    }

    /**
     * Add a child step and hand the group's logger, loader and max outputs setting on to it.
     */
    public function addStep(StepInterface $step): self
    {
        $logger = $this->logger;

        if ($logger instanceof LoggerInterface) {
            $step->addLogger($logger);
        }

        $loader = $this->loader;

        if (method_exists($step, 'setLoader') && $loader instanceof LoaderInterface) {
            $step->setLoader($loader);
        }

        if ($this->maxOutputs) {
            $step->maxOutputs($this->maxOutputs);
        }

        $this->steps[] = $step;

        return $this;
    }

    /**
     * Set the logger for the group and all child steps added so far.
     */
    public function addLogger(LoggerInterface $logger): static
    {
        parent::addLogger($logger);

        foreach ($this->steps as $childStep) {
            $childStep->addLogger($logger);
        }

        return $this;
    }

    /**
     * Set the loader for the group and all child steps added so far that have a setLoader() method.
     */
    public function setLoader(LoaderInterface $loader): self
    {
        $this->loader = $loader;

        foreach ($this->steps as $childStep) {
            if (!method_exists($childStep, 'setLoader')) {
                continue;
            }

            $childStep->setLoader($loader);
        }

        return $this;
    }

    /**
     * Set the output limit for the group and all child steps added so far.
     */
    public function maxOutputs(int $maxOutputs): static
    {
        parent::maxOutputs($maxOutputs);

        foreach ($this->steps as $childStep) {
            $childStep->maxOutputs($maxOutputs);
        }

        return $this;
    }

    public function outputType(): StepOutputType
    {
        return StepOutputType::AssociativeArrayOrObject;
    }

    /**
     * Whether the outputs of the child step shall become part of the group's output.
     *
     * That's the case unless the step has a shouldOutputBeExcludedFromGroupOutput() method
     * returning something other than false.
     */
    protected function includeOutput(StepInterface $step): bool
    {
        return !method_exists($step, 'shouldOutputBeExcludedFromGroupOutput')
            || $step->shouldOutputBeExcludedFromGroupOutput() === false;
    }

    /**
     * Add the data of one child step output to the data collected for the nth group output.
     *
     * Each property is collected as a list of values, because multiple child steps can
     * contribute values for the same key. Non-array data is appended as a list of its own.
     *
     * @param mixed[] $combined
     * @return mixed[]
     */
    private function addToCombinedOutputData(mixed $add, array $combined, int $nthElement): array
    {
        if (!is_array($add)) {
            $combined[$nthElement][][] = $add;

            return $combined;
        }

        foreach ($add as $property => $propertyValue) {
            $combined[$nthElement][$property][] = $propertyValue;
        }

        return $combined;
    }

    /**
     * Get the kept data of the output, whose keys don't exist in the kept data of the input.
     *
     * @return mixed[]
     */
    private function getNewlyKeptData(Output $output, Input $input): array
    {
        return array_filter(
            $output->keep,
            fn($key) => !array_key_exists($key, $input->keep),
            ARRAY_FILTER_USE_KEY,
        );
    }

    /**
     * Turn the collected data into the final group outputs.
     *
     * Applies refiners, filters, kept data, the uniqueness check and the output limit.
     *
     * @param mixed[] $combinedOutputs
     * @param mixed[] $combinedKeptData
     * @param Input $input
     * @return Generator<Output>
     * @throws Exception
     */
    private function prepareCombinedOutputs(array $combinedOutputs, array $combinedKeptData, Input $input): Generator
    {
        foreach ($combinedOutputs as $index => $collected) {
            if ($this->maxOutputsExceeded()) {
                break;
            }

            $data = $this->applyRefiners($this->normalizeCombinedOutputs($collected), $input->get());

            if (!$this->passesAllFilters($data)) {
                continue;
            }

            $output = $this->makeOutput($data, $input);

            if (array_key_exists($index, $combinedKeptData)) {
                $output->keep($this->normalizeCombinedOutputs($combinedKeptData[$index]));
            }

            if ($this->uniqueOutput !== false && !$this->inputOrOutputIsUnique($output)) {
                continue;
            }

            yield $output;

            $this->trackYieldedOutput();
        }
    }

    /**
     * Normalize combined outputs
     *
     * While collecting, every property value is added to a list. Lists containing just one
     * element are unwrapped here, so the property has that one element as value.
     *
     * @param mixed[] $combinedOutputs
     * @return mixed[]
     */
    private function normalizeCombinedOutputs(array $combinedOutputs): array
    {
        $normalized = array_map(function ($values) {
            if (count($values) === 1) {
                return reset($values);
            }

            return $values;
        }, $combinedOutputs);

        return $normalized;
    }
}


================================================
FILE: src/Steps/Html/CssSelector.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Html;

use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Dom\Node;
use Crwlr\Crawler\Steps\Dom\NodeList;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Utils\PhpVersion;
use DOMException;
use Symfony\Component\CssSelector\CssSelectorConverter;
use Symfony\Component\CssSelector\Exception\ExpressionErrorException;
use Symfony\Component\CssSelector\Exception\SyntaxErrorException;

final class CssSelector extends DomQuery
{
    /**
     * Trims the given selector and, unless it is empty, validates it before storing it.
     *
     * @throws InvalidDomQueryException when the selector is rejected by the validator in use
     */
    public function __construct(string $query)
    {
        $selector = trim($query);

        if ($selector !== '') {
            self::assertValidSelector($selector);
        }

        parent::__construct($selector);
    }

    /**
     * An empty selector matches the given node itself, anything else is delegated to querySelectorAll().
     */
    protected function filter(Node $node): NodeList
    {
        return $this->query === '' ? new NodeList([$node]) : $node->querySelectorAll($this->query);
    }

    /**
     * Validates a non-empty selector.
     *
     * From PHP 8.4 on, the selector is tried against an empty HtmlDocument; on older versions it is
     * converted with the symfony CssSelectorConverter. Failures of either are wrapped in an
     * InvalidDomQueryException.
     *
     * @throws InvalidDomQueryException
     */
    private static function assertValidSelector(string $selector): void
    {
        if (!PhpVersion::isBelow(8, 4)) {
            try {
                (new HtmlDocument('<!doctype html><html></html>'))->querySelector($selector);
            } catch (DOMException $exception) {
                throw InvalidDomQueryException::fromDomException($selector, $exception);
            }

            return;
        }

        try {
            (new CssSelectorConverter())->toXPath($selector);
        } catch (ExpressionErrorException|SyntaxErrorException $exception) {
            throw InvalidDomQueryException::fromSymfonyException($selector, $exception);
        }
    }
}


================================================
FILE: src/Steps/Html/DomQuery.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Html;

use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Dom\HtmlElement;
use Crwlr\Crawler\Steps\Dom\Node;
use Crwlr\Crawler\Steps\Dom\NodeList;
use Crwlr\Crawler\Steps\Dom\XmlElement;
use Crwlr\Html2Text\Exceptions\InvalidHtmlException;
use Crwlr\Html2Text\Html2Text;
use Crwlr\Url\Url;
use Exception;
use InvalidArgumentException;

/**
 * Base class for queries (e.g. CSS selectors or XPath queries) that select nodes from a DOM node
 * and extract a value (text, HTML, attribute, ...) from each match.
 *
 * Concrete subclasses only have to implement filter().
 */
abstract class DomQuery
{
    /**
     * Name of the attribute to read from matched elements. Set by attribute() and link().
     */
    public ?string $attributeName = null;

    /**
     * What to extract from a matched element.
     */
    protected SelectorTarget $target = SelectorTarget::Text;

    protected bool $onlyFirstMatch = false;

    protected bool $onlyLastMatch = false;

    /**
     * 1-based position of the only match to keep, or false when not restricted.
     */
    protected false|int $onlyNthMatch = false;

    protected bool $onlyEvenMatches = false;

    protected bool $onlyOddMatches = false;

    /**
     * Whether extracted values shall be resolved to absolute URLs against $baseUrl.
     */
    protected bool $toAbsoluteUrl = false;

    /**
     * Whether the fragment part is kept when resolving to an absolute URL.
     */
    protected bool $withFragment = true;

    protected ?string $baseUrl = null;

    /**
     * Converter used for the FormattedText target; created lazily when none was provided.
     */
    protected ?Html2Text $html2TextConverter = null;

    public function __construct(
        public readonly string $query,
    ) {}

    /**
     * Runs the query against the given node and extracts the configured target from the matches.
     *
     * Returns an array of values when more than one node remains after filtering, a single string
     * when exactly one element remains, and null otherwise.
     *
     * @return string[]|string|null
     * @throws InvalidHtmlException|Exception
     */
    public function apply(Node $node): array|string|null
    {
        // A base href defined in the document is applied on top of the current base URL.
        if ($this->toAbsoluteUrl && $node instanceof HtmlDocument) {
            $baseHref = $node->getBaseHref();

            if ($baseHref) {
                $this->setBaseUrl($baseHref);
            }
        }

        $filtered = $this->filter($node);

        if ($this->filtersMatches()) {
            $filtered = $this->filterMatches($filtered);

            if ($filtered === null) {
                return null;
            }
        }

        if ($filtered->count() > 1) {
            return $filtered->each(function ($element) {
                return $this->getTarget($element);
            });
        } elseif ($filtered->count() === 1) {
            $node = $filtered->first();

            if ($node instanceof HtmlElement || $node instanceof XmlElement) {
                return $this->getTarget($node);
            }
        }

        return null;
    }

    /**
     * Keep only the first match.
     */
    public function first(): self
    {
        $this->onlyFirstMatch = true;

        return $this;
    }

    /**
     * Keep only the last match.
     */
    public function last(): self
    {
        $this->onlyLastMatch = true;

        return $this;
    }

    /**
     * Keep only the n-th match (1-based).
     *
     * @throws InvalidArgumentException when $n is smaller than 1
     */
    public function nth(int $n): self
    {
        if ($n < 1) {
            throw new InvalidArgumentException('Argument $n must be greater than 0');
        }

        $this->onlyNthMatch = $n;

        return $this;
    }

    /**
     * Keep only matches at even positions (2nd, 4th, ...).
     */
    public function even(): self
    {
        $this->onlyEvenMatches = true;

        return $this;
    }

    /**
     * Keep only matches at odd positions (1st, 3rd, ...).
     */
    public function odd(): self
    {
        $this->onlyOddMatches = true;

        return $this;
    }

    /**
     * Extract the text of matched elements.
     */
    public function text(): self
    {
        $this->target = SelectorTarget::Text;

        return $this;
    }

    /**
     * Extract text produced by an Html2Text converter from the outer HTML/XML of matched elements.
     *
     * @param Html2Text|null $converter optional custom converter instance to use
     */
    public function formattedText(?Html2Text $converter = null): self
    {
        $this->target = SelectorTarget::FormattedText;

        if ($converter) {
            $this->html2TextConverter = $converter;
        }

        return $this;
    }

    /**
     * Extract the inner HTML (inner XML for XML elements) of matched elements.
     */
    public function html(): self
    {
        $this->target = SelectorTarget::Html;

        return $this;
    }

    /**
     * Extract the value of the given attribute of matched elements.
     */
    public function attribute(string $attributeName): self
    {
        $this->target = SelectorTarget::Attribute;

        $this->attributeName = $attributeName;

        return $this;
    }

    /**
     * Extract the outer HTML (outer XML for XML elements) of matched elements.
     */
    public function outerHtml(): self
    {
        $this->target = SelectorTarget::OuterHtml;

        return $this;
    }

    /**
     * Shortcut: extract the href attribute and convert it to an absolute URL.
     */
    public function link(): self
    {
        $this->target = SelectorTarget::Attribute;

        $this->attributeName = 'href';

        $this->toAbsoluteUrl = true;

        return $this;
    }

    /**
     * Remove the fragment part from values that are converted to absolute URLs.
     */
    public function withoutFragment(): self
    {
        $this->withFragment = false;

        return $this;
    }

    /**
     * Call this method and the selected value will be converted to an absolute url when apply() is called.
     *
     * @return $this
     */
    public function toAbsoluteUrl(): self
    {
        $this->toAbsoluteUrl = true;

        return $this;
    }

    /**
     * Automatically called when used in a Dom step.
     *
     * When a base URL is already set, the given one is resolved against it; otherwise it is stored as is.
     *
     * @throws Exception
     */
    public function setBaseUrl(string $baseUrl): static
    {
        if (!empty($this->baseUrl)) {
            $this->baseUrl = Url::parse($this->baseUrl)->resolve($baseUrl)->__toString();
        } else {
            $this->baseUrl = $baseUrl;
        }

        return $this;
    }

    /**
     * Select the nodes matching this query, starting from the given node.
     */
    abstract protected function filter(Node $node): NodeList;

    /**
     * Whether any restriction on the matches (first, last, nth, even, odd) is active.
     */
    protected function filtersMatches(): bool
    {
        return $this->onlyFirstMatch ||
            $this->onlyLastMatch ||
            $this->onlyNthMatch !== false ||
            $this->onlyEvenMatches ||
            $this->onlyOddMatches;
    }

    /**
     * Reduces the matches according to the active restriction.
     *
     * Precedence when several restrictions are set: first, last, nth, even/odd.
     * Returns null when there are no matches, or fewer matches than the requested n-th position.
     *
     * @return NodeList|null
     * @throws Exception
     */
    protected function filterMatches(NodeList $matches): ?NodeList
    {
        if (
            $matches->count() === 0 ||
            ($this->onlyNthMatch !== false && $matches->count() < $this->onlyNthMatch)
        ) {
            return null;
        }

        if ($this->onlyFirstMatch) {
            $node = $matches->first();

            return $node ? new NodeList([$node]) : new NodeList([]);
        } elseif ($this->onlyLastMatch) {
            $node = $matches->last();

            return $node ? new NodeList([$node]) : new NodeList([]);
        } elseif ($this->onlyNthMatch !== false) {
            $node = $matches->nth($this->onlyNthMatch);

            return $node ? new NodeList([$node]) : new NodeList([]);
        } elseif ($this->onlyEvenMatches || $this->onlyOddMatches) {
            return $this->filterEvenOrOdd($matches);
        }

        return null;
    }

    /**
     * Keeps the nodes at even and/or odd positions, counting from 1.
     *
     * @param NodeList $domCrawler
     * @return NodeList
     */
    protected function filterEvenOrOdd(NodeList $domCrawler): NodeList
    {
        $nodes = [];

        $i = 1;

        foreach ($domCrawler as $node) {
            if (
                ($this->onlyEvenMatches && $i % 2 === 0) ||
                ($this->onlyOddMatches && $i % 2 !== 0)
            ) {
                $nodes[] = $node;
            }

            $i++;
        }

        return new NodeList($nodes);
    }

    /**
     * Extracts the configured target value from a single matched element.
     *
     * @throws InvalidHtmlException
     * @throws Exception
     */
    protected function getTarget(HtmlElement|XmlElement $node): string
    {
        if ($this->target === SelectorTarget::FormattedText) {
            if (!$this->html2TextConverter) {
                $this->html2TextConverter = new Html2Text();
            }

            $target = $this->html2TextConverter->convertHtmlToText(
                $node instanceof HtmlElement ? $node->outerHtml() : $node->outerXml(),
            );
        } elseif ($this->target === SelectorTarget::Html) {
            $target = $node instanceof HtmlElement ? trim($node->innerHtml()) : trim($node->innerXml());
        } elseif ($this->target === SelectorTarget::OuterHtml) {
            $target = $node instanceof HtmlElement ? trim($node->outerHtml()) : trim($node->outerXml());
        } else {
            // A set attribute name takes precedence. Otherwise the lowercased enum case name
            // (e.g. "text") is called as a method on the node, if such a method exists.
            $target = trim(
                $this->attributeName ?
                    ($node->getAttribute($this->attributeName) ?? '') :
                    (
                        method_exists($node, strtolower($this->target->name)) ?
                            $node->{strtolower($this->target->name)}() :
                            ''
                    ),
            );
        }

        if ($this->toAbsoluteUrl && $this->baseUrl !== null) {
            // Convert explicitly: handleUrlFragment() returns a Url object, but the code below and
            // the declared return type need a string. Relying on implicit coercion would break
            // under strict types.
            $target = $this->handleUrlFragment(Url::parse($this->baseUrl)->resolve($target))->__toString();
        }

        // Strip replacement characters from the result.
        // NOTE(review): presumably artifacts of invalid byte sequences in the source document - confirm.
        if (str_contains($target, '�')) {
            $target = str_replace('�', '', $target);
        }

        return $target;
    }

    /**
     * Clears the fragment of the given URL when withoutFragment() was called.
     *
     * @throws Exception
     */
    protected function handleUrlFragment(Url $url): Url
    {
        if (!$this->withFragment) {
            $url->fragment('');
        }

        return $url;
    }
}


================================================
FILE: src/Steps/Html/Exceptions/InvalidDomQueryException.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Html\Exceptions;

use DOMException;
use Exception;
use Symfony\Component\CssSelector\Exception\ExpressionErrorException;
use Symfony\Component\CssSelector\Exception\SyntaxErrorException;

/**
 * Thrown when a DOM query (CSS selector or XPath query) is invalid. Carries the offending query.
 */
class InvalidDomQueryException extends Exception
{
    protected string $query = '';

    /**
     * Creates an exception from a plain message and the query it refers to.
     */
    public static function make(string $message, string $domQuery): self
    {
        $instance = new self($message);
        $instance->setDomQuery($domQuery);

        return $instance;
    }

    /**
     * Wraps an exception thrown by the symfony CSS selector component.
     */
    public static function fromSymfonyException(
        string $domQuery,
        ExpressionErrorException|SyntaxErrorException $originalException,
    ): self {
        return self::wrapOriginal($domQuery, $originalException);
    }

    /**
     * Wraps a DOMException.
     */
    public static function fromDomException(string $domQuery, DOMException $originalException): self
    {
        return self::wrapOriginal($domQuery, $originalException);
    }

    public function setDomQuery(string $domQuery): void
    {
        $this->query = $domQuery;
    }

    public function getDomQuery(): string
    {
        return $this->query;
    }

    /**
     * Builds an instance that copies message and code from the original exception,
     * keeps it as previous exception and remembers the query.
     */
    private static function wrapOriginal(string $domQuery, \Throwable $original): self
    {
        $instance = new self($original->getMessage(), $original->getCode(), $original);
        $instance->setDomQuery($domQuery);

        return $instance;
    }
}


================================================
FILE: src/Steps/Html/GetLink.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Html;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Dom\HtmlElement;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Steps\StepOutputType;
use Crwlr\Url\Url;
use Exception;
use Generator;
use InvalidArgumentException;

/**
 * Step that yields the first link (absolute URL as string) found in an HTML document,
 * optionally restricted by a CSS selector and by domain/host criteria.
 */
class GetLink extends Step
{
    /**
     * Base URL that link hrefs are resolved against. Set from the input's effective URI and
     * adapted when the document defines a base href.
     */
    protected Url $baseUri;

    /**
     * null = no restriction, true = only links on the base URL's domain, false = only links on other domains.
     */
    protected ?bool $onSameDomain = null;

    /**
     * @var null|string[]
     */
    protected ?array $onDomain = null;

    /**
     * null = no restriction, true = only links on the base URL's host, false = only links on other hosts.
     */
    protected ?bool $onSameHost = null;

    /**
     * @var null|string[]
     */
    protected ?array $onHost = null;

    protected bool $withFragment = true;

    protected string|CssSelector|null $selector = null;

    /**
     * @param string|CssSelector|null $selector CSS selector for the link elements; a string is
     *                                          converted to a CssSelector instance right away
     * @throws InvalidDomQueryException
     */
    public function __construct(string|CssSelector|null $selector = null)
    {
        $this->selector = is_string($selector) ? new CssSelector($selector) : $selector;
    }

    /**
     * Tells if the href of the given element uses the mailto:, tel: or javascript: scheme.
     *
     * Leading whitespace is ignored and the comparison is case-insensitive, because URL schemes
     * are case-insensitive and hrefs like "JavaScript:void(0)" must not be treated as HTTP links.
     */
    public static function isSpecialNonHttpLink(HtmlElement $linkElement): bool
    {
        $href = strtolower(ltrim($linkElement->getAttribute('href') ?? ''));

        foreach (['mailto:', 'tel:', 'javascript:'] as $scheme) {
            if (str_starts_with($href, $scheme)) {
                return true;
            }
        }

        return false;
    }

    public function outputType(): StepOutputType
    {
        return StepOutputType::Scalar;
    }

    /**
     * Accepts only RespondedRequest inputs; remembers the effective URI as base URL and
     * returns the response body as HtmlDocument.
     *
     * @throws MissingZlibExtensionException
     */
    protected function validateAndSanitizeInput(mixed $input): HtmlDocument
    {
        if (!$input instanceof RespondedRequest) {
            throw new InvalidArgumentException('Input must be an instance of RespondedRequest.');
        }

        $this->baseUri = Url::parse($input->effectiveUri());

        return new HtmlDocument(Http::getBodyString($input));
    }

    /**
     * Yields the first link matching the selector (default: all a elements) and all criteria.
     *
     * @param HtmlDocument $input
     * @return Generator<string>
     * @throws Exception
     */
    protected function invoke(mixed $input): Generator
    {
        $this->getBaseFromDocument($input);

        $selector = $this->selector ?? 'a';

        if (is_string($selector)) {
            $selector = new CssSelector($selector);
        }

        foreach ($input->querySelectorAll($selector->query) as $link) {
            $linkUrl = $this->getLinkUrl($link);

            if ($linkUrl) {
                yield (string) $linkUrl;

                // Only the first usable link is wanted.
                break;
            }
        }
    }

    /**
     * Only accept links on the same domain as the base URL.
     */
    public function onSameDomain(): static
    {
        $this->onSameDomain = true;

        return $this;
    }

    /**
     * Only accept links that are not on the same domain as the base URL.
     */
    public function notOnSameDomain(): static
    {
        $this->onSameDomain = false;

        return $this;
    }

    /**
     * Only accept links on one of the given domains. Repeated calls add to the list.
     *
     * @param string|string[] $domains
     * @return $this
     */
    public function onDomain(string|array $domains): static
    {
        if (is_array($domains) && !$this->isArrayWithOnlyStrings($domains)) {
            throw new InvalidArgumentException('You can only set domains from string values');
        }

        $domains = is_string($domains) ? [$domains] : $domains;

        $this->onDomain = $this->onDomain ? array_merge($this->onDomain, $domains) : $domains;

        return $this;
    }

    /**
     * Only accept links on the same host as the base URL.
     */
    public function onSameHost(): static
    {
        $this->onSameHost = true;

        return $this;
    }

    /**
     * Only accept links that are not on the same host as the base URL.
     */
    public function notOnSameHost(): static
    {
        $this->onSameHost = false;

        return $this;
    }

    /**
     * Only accept links on one of the given hosts. Repeated calls add to the list.
     *
     * @param string|string[] $hosts
     */
    public function onHost(string|array $hosts): static
    {
        if (is_array($hosts) && !$this->isArrayWithOnlyStrings($hosts)) {
            throw new InvalidArgumentException('You can only set hosts from string values');
        }

        $hosts = is_string($hosts) ? [$hosts] : $hosts;

        $this->onHost = $this->onHost ? array_merge($this->onHost, $hosts) : $hosts;

        return $this;
    }

    /**
     * Remove the fragment part from yielded URLs.
     */
    public function withoutFragment(): static
    {
        $this->withFragment = false;

        return $this;
    }

    /**
     * When the document defines a base href, resolve it against the current base URL and use the result.
     *
     * @throws Exception
     */
    protected function getBaseFromDocument(HtmlDocument $document): void
    {
        $baseHref = $document->getBaseHref();

        if (!empty($baseHref)) {
            $this->baseUri = $this->baseUri->resolve($baseHref);
        }
    }

    /**
     * Returns the absolute URL of the given link element, or null when the element is no a element,
     * is a special non HTTP link or does not match the domain/host criteria.
     *
     * @throws Exception
     */
    protected function getLinkUrl(HtmlElement $link): ?Url
    {
        if ($link->nodeName() !== 'a') {
            $this->logger?->warning('Selector matched <' . $link->nodeName() . '> html element. Ignored it.');

            return null;
        }

        if (self::isSpecialNonHttpLink($link)) {
            return null;
        }

        $linkUrl = $this->handleUrlFragment(
            $this->baseUri->resolve($link->getAttribute('href') ?? ''),
        );

        if ($this->matchesAdditionalCriteria($linkUrl)) {
            return $linkUrl;
        }

        return null;
    }

    /**
     * Checks all configured domain/host restrictions; restrictions that are not set (null) are skipped.
     *
     * @throws Exception
     */
    protected function matchesAdditionalCriteria(Url $link): bool
    {
        return ($this->onSameDomain === null || $this->isOnSameDomain($link)) &&
            ($this->onSameHost === null || $this->isOnSameHost($link)) &&
            ($this->onDomain === null || $this->isOnDomain($link)) &&
            ($this->onHost === null || $this->isOnHost($link));
    }

    protected function isOnSameDomain(Url $link): bool
    {
        return ($this->onSameDomain && $this->baseUri->isDomainEqualIn($link)) ||
            ($this->onSameDomain === false && !$this->baseUri->isDomainEqualIn($link));
    }

    protected function isOnSameHost(Url $link): bool
    {
        return ($this->onSameHost && $this->baseUri->isHostEqualIn($link)) ||
            ($this->onSameHost === false && !$this->baseUri->isHostEqualIn($link));
    }

    /**
     * @throws Exception
     */
    protected function isOnDomain(Url $link): bool
    {
        if (is_array($this->onDomain)) {
            foreach ($this->onDomain as $domain) {
                if ($link->domain() === $domain) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * @throws Exception
     */
    protected function isOnHost(Url $link): bool
    {
        if (is_array($this->onHost)) {
            foreach ($this->onHost as $host) {
                if ($link->host() === $host) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * @param mixed[] $array
     * @return bool
     */
    protected function isArrayWithOnlyStrings(array $array): bool
    {
        foreach ($array as $element) {
            if (!is_string($element)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Clears the fragment of the given URL when withoutFragment() was called.
     *
     * @throws Exception
     */
    protected function handleUrlFragment(Url $url): Url
    {
        if (!$this->withFragment) {
            $url->fragment('');
        }

        return $url;
    }
}


================================================
FILE: src/Steps/Html/GetLinks.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Html;

use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Exception;
use Generator;

/**
 * Same as GetLink, but yields every matching link instead of stopping after the first one.
 */
class GetLinks extends GetLink
{
    /**
     * @param HtmlDocument $input
     * @return Generator<string>
     * @throws Exception
     */
    protected function invoke(mixed $input): Generator
    {
        $this->getBaseFromDocument($input);

        // Without a configured selector, all a elements are considered.
        $selector = $this->selector ?? new CssSelector('a');

        if (is_string($selector)) {
            $selector = new CssSelector($selector);
        }

        foreach ($input->querySelectorAll($selector->query) as $linkElement) {
            $url = $this->getLinkUrl($linkElement);

            if ($url === null) {
                continue;
            }

            yield (string) $url;
        }
    }
}


================================================
FILE: src/Steps/Html/MetaData.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Html;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Steps\StepOutputType;
use Generator;

/**
 * Step that yields one array with the document title and the content of its meta elements,
 * keyed by the meta element's name (or property) attribute.
 */
class MetaData extends Step
{
    /**
     * @var string[]
     */
    protected array $onlyKeys = [];

    /**
     * Restrict the output to the given keys.
     *
     * @param string[] $keys
     */
    public function only(array $keys): static
    {
        $this->onlyKeys = $keys;

        return $this;
    }

    public function outputType(): StepOutputType
    {
        return StepOutputType::AssociativeArrayOrObject;
    }

    /**
     * @param HtmlDocument $input
     */
    protected function invoke(mixed $input): Generator
    {
        $collected = $this->addToData([], 'title', $this->getTitle($input));

        foreach ($input->querySelectorAll('meta') as $meta) {
            // Fall back to the property attribute when there is no (non-empty) name attribute.
            $key = $meta->getAttribute('name');

            if (empty($key)) {
                $key = $meta->getAttribute('property');
            }

            if (empty($key) || !$this->isRequestedKey($key)) {
                continue;
            }

            $collected = $this->addToData($collected, $key, $meta->getAttribute('content') ?? '');
        }

        yield $collected;
    }

    /**
     * @throws MissingZlibExtensionException
     */
    protected function validateAndSanitizeInput(mixed $input): mixed
    {
        return $this->validateAndSanitizeToHtmlDocumentInstance($input);
    }

    /**
     * Text of the document's title element, or an empty string when there is none.
     */
    protected function getTitle(HtmlDocument $document): string
    {
        $title = $document->querySelector('title');

        return $title ? $title->text() : '';
    }

    /**
     * Adds the key/value pair to the data, unless the key is excluded via only().
     *
     * @param array<string, string> $data
     * @return array<string, string>
     */
    protected function addToData(array $data, string $key, string $value): array
    {
        if ($this->isRequestedKey($key)) {
            $data[$key] = $value;
        }

        return $data;
    }

    /**
     * A key is requested when no restriction was set via only(), or when it is part of that list.
     */
    private function isRequestedKey(string $key): bool
    {
        return empty($this->onlyKeys) || in_array($key, $this->onlyKeys, true);
    }
}


================================================
FILE: src/Steps/Html/SchemaOrg.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Html;

use Adbar\Dot;
use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Steps\StepOutputType;
use Generator;
use Spatie\SchemaOrg\BaseType;

/**
 * Step that extracts schema.org objects from an HTML string and yields them, either as objects,
 * as arrays or reduced to a mapping of selected properties.
 */
class SchemaOrg extends Step
{
    protected bool $toArray = false;

    protected ?string $onlyType = null;

    /**
     * @var array<int|string, string>
     */
    protected array $mapping = [];

    /**
     * Yield arrays instead of schema.org objects.
     */
    public function toArray(): static
    {
        $this->toArray = true;

        return $this;
    }

    /**
     * Yield only objects of the given schema.org type.
     */
    public function onlyType(string $type = ''): static
    {
        $this->onlyType = $type;

        return $this;
    }

    /**
     * Yield only the given properties. Values are dot notation keys; string keys of the
     * mapping become the output keys, for integer keys the dot notation key itself is used.
     *
     * @param array<int|string, string> $mapping
     */
    public function extract(array $mapping): static
    {
        $this->mapping = $mapping;

        return $this;
    }

    public function outputType(): StepOutputType
    {
        return StepOutputType::AssociativeArrayOrObject;
    }

    /**
     * @param string $input
     */
    protected function invoke(mixed $input): Generator
    {
        foreach (\Crwlr\SchemaOrg\SchemaOrg::fromHtml($input, $this->logger) as $schemaOrgObject) {
            // No type restriction, or the object itself has the wanted type: yield it directly.
            if (!$this->onlyType || $schemaOrgObject->getType() === $this->onlyType) {
                yield $this->prepareReturnValue($schemaOrgObject);

                continue;
            }

            // Otherwise look for the wanted type in the object's child properties.
            yield from $this->scanChildrenForType($schemaOrgObject);
        }
    }

    /**
     * @throws MissingZlibExtensionException
     */
    protected function validateAndSanitizeInput(mixed $input): string
    {
        return $this->validateAndSanitizeStringOrHttpResponse($input);
    }

    /**
     * Recursively walks through properties holding schema.org objects and yields those of the wanted type.
     * Children of an object that itself has the wanted type are not scanned any further.
     */
    protected function scanChildrenForType(BaseType $schemaOrgObject): Generator
    {
        foreach ($schemaOrgObject->getProperties() as $propertyName => $property) {
            $child = $schemaOrgObject->getProperty($propertyName);

            if (!$child instanceof BaseType) {
                continue;
            }

            if ($child->getType() === $this->onlyType) {
                yield $this->prepareReturnValue($child);
            } else {
                yield from $this->scanChildrenForType($child);
            }
        }
    }

    /**
     * Converts the object according to the configuration: mapped array when a mapping is set,
     * plain array when toArray() was called, the object itself otherwise.
     *
     * @return BaseType|mixed[]
     */
    protected function prepareReturnValue(BaseType $object): BaseType|array
    {
        if (!empty($this->mapping)) {
            return $this->applyMapping($object->toArray());
        }

        return $this->toArray ? $object->toArray() : $object;
    }

    /**
     * @param mixed[] $schemaOrgData
     * @return mixed[]
     */
    protected function applyMapping(array $schemaOrgData): array
    {
        $dot = new Dot($schemaOrgData);

        $result = [];

        foreach ($this->mapping as $outputKey => $dotNotationKey) {
            $result[is_int($outputKey) ? $dotNotationKey : $outputKey] = $dot->get($dotNotationKey);
        }

        return $result;
    }
}


================================================
FILE: src/Steps/Html/SelectorTarget.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Html;

/**
 * The kinds of values a DomQuery can extract from a matched element.
 *
 * Note: DomQuery::getTarget() uses the lowercased case name as a method name on the node
 * when no attribute name is set, so renaming a case changes behavior.
 */
enum SelectorTarget
{
    // Text of the element (the default target of a DomQuery).
    case Text;

    // Text produced by an Html2Text converter from the element's outer HTML/XML.
    case FormattedText;

    // Inner HTML (inner XML for XML elements).
    case Html;

    // Value of an attribute; the attribute name is stored on the DomQuery.
    case Attribute;

    // Outer HTML (outer XML for XML elements).
    case OuterHtml;
}


================================================
FILE: src/Steps/Html/XPathQuery.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Html;

use Crwlr\Crawler\Steps\Dom\Node;
use Crwlr\Crawler\Steps\Dom\NodeList;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use DOMDocument;
use DOMXPath;

/**
 * DomQuery implementation that selects nodes with an XPath query.
 */
class XPathQuery extends DomQuery
{
    /**
     * Trims the given query and, unless it is empty, validates it before storing it.
     *
     * @throws InvalidDomQueryException when the query is not a valid XPath expression
     */
    public function __construct(string $query)
    {
        $query = trim($query);

        if ($query !== '') {
            $this->validateQuery($query);
        }

        parent::__construct($query);
    }

    /**
     * An empty query matches the given node itself, anything else is delegated to queryXPath().
     */
    protected function filter(Node $node): NodeList
    {
        if ($this->query === '') {
            return new NodeList([$node]);
        }

        return $node->queryXPath($this->query);
    }

    /**
     * Evaluates the query against an empty document to find out if it is a valid expression.
     *
     * @throws InvalidDomQueryException
     */
    private function validateQuery(string $query): void
    {
        // Temporarily set a new error handler, so checking an invalid XPath query does not generate a PHP warning.
        // All other errors are passed on (return false) to PHP's standard error handling.
        set_error_handler(static function ($errno, $errstr) {
            return $errno === E_WARNING && $errstr === 'DOMXPath::evaluate(): Invalid expression';
        });

        try {
            $evaluation = (new DOMXPath(new DOMDocument()))->evaluate($query);
        } finally {
            // Always remove the temporary handler again, even when evaluate() throws.
            restore_error_handler();
        }

        if ($evaluation === false) {
            throw InvalidDomQueryException::make('Invalid XPath query', $query);
        }
    }
}


================================================
FILE: src/Steps/Html.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Html\CssSelector;
use Crwlr\Crawler\Steps\Html\DomQuery;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Crawler\Steps\Html\GetLink;
use Crwlr\Crawler\Steps\Html\GetLinks;
use Crwlr\Crawler\Steps\Html\MetaData;
use Crwlr\Crawler\Steps\Html\SchemaOrg;

/**
 * Dom step for HTML documents. Also serves as entry point (static factories) for the
 * HTML related steps GetLink, GetLinks, MetaData and SchemaOrg.
 */
class Html extends Dom
{
    /**
     * Creates a GetLink step, optionally with a CSS selector for the link elements.
     *
     * @throws InvalidDomQueryException
     */
    public static function getLink(?string $selector = null): GetLink
    {
        return new GetLink($selector);
    }

    /**
     * Creates a GetLinks step, optionally with a CSS selector for the link elements.
     *
     * @throws InvalidDomQueryException
     */
    public static function getLinks(?string $selector = null): GetLinks
    {
        return new GetLinks($selector);
    }

    /**
     * Creates a MetaData step.
     */
    public static function metaData(): MetaData
    {
        return new MetaData();
    }

    /**
     * Creates a SchemaOrg step.
     */
    public static function schemaOrg(): SchemaOrg
    {
        return new SchemaOrg();
    }

    /**
     * Remembers the effective URI of RespondedRequest inputs as base URL and converts
     * the input to an HtmlDocument instance.
     *
     * @param mixed $input
     * @return HtmlDocument
     * @throws MissingZlibExtensionException
     */
    protected function validateAndSanitizeInput(mixed $input): HtmlDocument
    {
        if ($input instanceof RespondedRequest) {
            $this->baseUrl = $input->effectiveUri();
        }

        return $this->validateAndSanitizeToHtmlDocumentInstance($input);
    }

    /**
     * Plain string queries are treated as CSS selectors in HTML steps.
     *
     * @throws InvalidDomQueryException
     */
    protected function makeDefaultDomQueryInstance(string $query): DomQuery
    {
        return new CssSelector($query);
    }
}


================================================
FILE: src/Steps/Json.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

use Adbar\Dot;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Utils\Json as JsonUtil;
use Crwlr\Utils\Exceptions\InvalidJsonException;
use Generator;
use Throwable;

/**
 * Step that decodes a JSON string and yields either the whole decoded data or
 * selected properties, addressed via dot notation.
 */
class Json extends Step
{
    /**
     * @param mixed[] $propertyMapping
     */
    final public function __construct(protected ?array $propertyMapping = [], protected ?string $each = null) {}

    /**
     * Yield the whole decoded data (no property mapping).
     */
    public static function all(): static
    {
        return new static(null);
    }

    /**
     * Yield one array with the mapped properties.
     *
     * @param mixed[] $propertyMapping
     */
    public static function get(array $propertyMapping = []): static
    {
        return new static($propertyMapping);
    }

    /**
     * Yield one array with the mapped properties per item of the iterable found at $each
     * (an empty string addresses the root of the decoded data).
     *
     * @param mixed[] $propertyMapping
     */
    public static function each(string $each, array $propertyMapping = []): static
    {
        return new static($propertyMapping, $each);
    }

    public function outputType(): StepOutputType
    {
        return StepOutputType::AssociativeArrayOrObject;
    }

    protected function validateAndSanitizeInput(mixed $input): mixed
    {
        return $this->validateAndSanitizeStringOrHttpResponse($input);
    }

    protected function invoke(mixed $input): Generator
    {
        $decoded = $this->inputStringToArray($input);

        if ($decoded === null) {
            $this->logger?->warning('Failed to decode JSON string.');

            return;
        }

        // No mapping at all means: yield everything as is.
        if ($this->propertyMapping === null) {
            yield $decoded;

            return;
        }

        $dot = new Dot($decoded);

        if ($this->each === null) {
            yield $this->mapProperties($dot);

            return;
        }

        $items = $this->each === '' ? $dot->get() : $dot->get($this->each);

        if (!is_iterable($items)) {
            $this->logger?->warning('The target of "each" does not exist in the JSON data.');

            return;
        }

        foreach ($items as $item) {
            yield $this->mapProperties(new Dot($item));
        }
    }

    /**
     * Decodes the input string; returns null when it can't be decoded.
     *
     * @return mixed[]|null
     */
    protected function inputStringToArray(string $input): ?array
    {
        try {
            return JsonUtil::stringToArray($input);
        } catch (InvalidJsonException) {
            // Not plain JSON, continue with the HTML fallback below.
        }

        // If headless browser is used in loader, the JSON in the response body is wrapped in an HTML document.
        if (!str_contains($input, '<html') && !str_contains($input, '<HTML')) {
            return null;
        }

        try {
            $bodyText = (new HtmlDocument($input))->querySelector('body')?->text() ?? '';

            return JsonUtil::stringToArray($bodyText);
        } catch (Throwable) {
            return null;
        }
    }

    /**
     * Builds the output array from the property mapping. String keys of the mapping become the
     * output keys, for integer keys the dot notation key itself is used.
     *
     * @param Dot<int|string, mixed> $dot
     * @return mixed[]
     */
    protected function mapProperties(Dot $dot): array
    {
        if (empty($this->propertyMapping)) {
            return [];
        }

        $result = [];

        foreach ($this->propertyMapping as $propertyKey => $dotNotation) {
            $outputKey = is_int($propertyKey) ? $dotNotation : $propertyKey;

            // An empty key, or "*" when the data has no actual "*" property, addresses the whole data.
            $wantsEverything = $dotNotation === '' || ($dotNotation === '*' && $dot->get('*') === null);

            $result[$outputKey] = $wantsEverything ? $dot->all() : $dot->get($dotNotation);
        }

        return $result;
    }
}


================================================
FILE: src/Steps/Loading/GetSitemapsFromRobotsTxt.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading;

use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Steps\StepOutputType;
use Crwlr\RobotsTxt\Exceptions\InvalidRobotsTxtFileException;
use Generator;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;

/**
 * Step that yields the sitemap URLs which the loader's robots.txt handler reports for the input URL.
 */
class GetSitemapsFromRobotsTxt extends Step
{
    /**
     * @use LoadingStep<HttpLoader>
     */
    use LoadingStep;

    /**
     * Every output of this step is a single scalar value.
     */
    public function outputType(): StepOutputType
    {
        return StepOutputType::Scalar;
    }

    /**
     * Ask the robots.txt handler of the loader for the sitemaps of the input URL and yield them one by one.
     *
     * @throws InvalidRobotsTxtFileException
     */
    protected function invoke(mixed $input): Generator
    {
        $sitemapUrls = $this->getLoader()->robotsTxt()->getSitemaps($input);

        foreach ($sitemapUrls as $url) {
            yield $url;
        }
    }

    /**
     * Accept anything that validateAndSanitizeToUriInterface() can turn into a URI.
     *
     * @throws InvalidArgumentException
     */
    protected function validateAndSanitizeInput(mixed $input): UriInterface
    {
        $uri = $this->validateAndSanitizeToUriInterface($input);

        return $uri;
    }
}


================================================
FILE: src/Steps/Loading/Http/AbstractPaginator.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http;

use Closure;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules\StopRule;
use Crwlr\Crawler\Utils\RequestKey;
use Crwlr\Url\Url;
use Psr\Http\Message\RequestInterface;
use Psr\Log\LoggerInterface;

/**
 * Base class for paginators.
 *
 * It keeps track of which requests have already been loaded, counts loaded pages against the
 * max pages limit and evaluates the registered stop rules. Concrete paginators implement
 * getNextRequest() to decide which request shall be loaded next.
 */
abstract class AbstractPaginator
{
    /**
     * Keys (see RequestKey) of all requests that have been registered as loaded.
     *
     * @var array<string, true>
     */
    protected array $loaded = [];

    /**
     * Number of loaded requests since the last call to resetFinished().
     */
    protected int $loadedCount = 0;

    /**
     * The request that was registered most recently, or null when nothing was registered yet.
     *
     * Explicitly initialized with null: a typed property without a default value is in the
     * uninitialized state and reading it (as getNextRequest() implementations do) throws an Error.
     */
    protected ?RequestInterface $latestRequest = null;

    /**
     * @var array<int, Closure|StopRule>
     */
    protected array $stopRules = [];

    protected bool $hasFinished = false;

    public function __construct(protected int $maxPages = Paginator::MAX_PAGES_DEFAULT) {}

    /**
     * Register a request (and its response, if there is one) as loaded.
     */
    public function processLoaded(
        RequestInterface $request,
        ?RespondedRequest $respondedRequest,
    ): void {
        $this->registerLoadedRequest($respondedRequest ?? $request);
    }

    /**
     * @return bool true when a stop rule has fired or the max pages limit is reached
     */
    public function hasFinished(): bool
    {
        return $this->hasFinished || $this->maxPagesReached();
    }

    /**
     * When a paginate step is called with multiple inputs, like:
     *
     * ['https://www.example.com/listing1', 'https://www.example.com/listing2', ...]
     *
     * it always has to start paginating again for each listing base URL.
     * Therefore, we reset the state after finishing paginating one base input.
     * Except for $this->loaded, because if it would be the case that the exact same pages are
     * discovered whilst paginating, we don't want to load the exact same pages again and again.
     */
    public function resetFinished(): void
    {
        $this->hasFinished = false;

        $this->loadedCount = 0;

        $this->latestRequest = null;
    }

    /**
     * Add a rule that is evaluated after each registered request. Paginating stops as soon as one rule
     * returns a truthy value. Closures are called bound to this paginator instance, with the request
     * and the (nullable) responded request as arguments.
     */
    public function stopWhen(Closure|StopRule $callback): self
    {
        $this->stopRules[] = $callback;

        return $this;
    }

    public function logWhenFinished(LoggerInterface $logger): void
    {
        if ($this->maxPagesReached()) {
            $logger->notice('Max pages limit reached.');
        } else {
            $logger->info('Finished paginating.');
        }
    }

    /**
     * @return RequestInterface|null the next request to load, or null when there is none
     */
    abstract public function getNextRequest(): ?RequestInterface;

    /**
     * Remember a request as loaded, update the latest request and evaluate the stop rules.
     *
     * A request whose key is already known is ignored completely: it neither counts as a loaded page,
     * nor does it become the latest request.
     */
    protected function registerLoadedRequest(RequestInterface|RespondedRequest $request): void
    {
        $respondedRequest = $request instanceof RespondedRequest ? $request : null;

        $plainRequest = $request instanceof RespondedRequest ? $request->request : $request;

        $key = RequestKey::from($plainRequest);

        if (array_key_exists($key, $this->loaded)) {
            return;
        }

        $this->loaded[$key] = true;

        $this->loadedCount++;

        if ($respondedRequest !== null) {
            // Also mark all URLs of the redirect chain as loaded, so they are not requested again.
            foreach ($respondedRequest->redirects() as $redirectUrl) {
                $this->loaded[RequestKey::from($plainRequest->withUri(Url::parsePsr7($redirectUrl)))] = true;
            }
        }

        $this->latestRequest = $plainRequest;

        if ($this->shouldStop($plainRequest, $respondedRequest)) {
            $this->setFinished();
        }
    }

    /**
     * @return bool true when the max pages limit is reached or any stop rule returns a truthy value
     */
    protected function shouldStop(RequestInterface $request, ?RespondedRequest $respondedRequest): bool
    {
        if ($this->maxPagesReached()) {
            return true;
        }

        foreach ($this->stopRules as $stopRule) {
            if ($stopRule instanceof StopRule && $stopRule->shouldStop($request, $respondedRequest)) {
                return true;
            } elseif ($stopRule instanceof Closure && $stopRule->call($this, $request, $respondedRequest)) {
                return true;
            }
        }

        return false;
    }

    protected function maxPagesReached(): bool
    {
        return $this->loadedCount >= $this->maxPages;
    }

    protected function setFinished(): self
    {
        $this->hasFinished = true;

        return $this;
    }
}


================================================
FILE: src/Steps/Loading/Http/Browser/BrowserAction.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Browser;

use Closure;
use Crwlr\Crawler\Loader\Http\Browser\Screenshot;
use Crwlr\Crawler\Loader\Http\Browser\ScreenshotConfig;
use Crwlr\Utils\Microseconds;
use HeadlessChromium\Page;
use Psr\Log\LoggerInterface;
use Throwable;

/**
 * Factory for closures that perform an action on a headless browser page.
 *
 * Every method returns a Closure that expects the HeadlessChromium Page instance as first argument.
 */
class BrowserAction
{
    /**
     * Default value for the $timeout arguments, passed on unchanged to the chrome-php methods.
     */
    public const DEFAULT_TIMEOUT = 15_000;

    /**
     * Wait until an element matching the CSS selector exists in the document.
     */
    public static function waitUntilDocumentContainsElement(
        string $cssSelector,
        int $timeout = self::DEFAULT_TIMEOUT,
    ): Closure {
        return function (Page $page) use ($cssSelector, $timeout) {
            $page->waitUntilContainsElement($cssSelector, $timeout);
        };
    }

    /**
     * Wait until an element matching the CSS selector exists, then click it with the mouse.
     */
    public static function clickElement(
        string $cssSelector,
        int $timeout = self::DEFAULT_TIMEOUT,
    ): Closure {
        return function (Page $page) use ($cssSelector, $timeout) {
            $page->waitUntilContainsElement($cssSelector, $timeout);

            $page->mouse()->find($cssSelector)->click();
        };
    }

    /**
     * Click an element that lives inside a shadow DOM within the document.
     *
     * For this purpose the action needs two selectors: the first one to select the shadow host element and the
     * second one to select the element that shall be clicked inside that shadow DOM.
     *
     * Both selectors are embedded in the evaluated script as encoded JavaScript string literals, so selectors
     * containing quotes or backslashes (like [data-name='foo']) don't break the script.
     */
    public static function clickInsideShadowDom(
        string $shadowHostSelector,
        string $clickElementSelector,
        int $timeout = self::DEFAULT_TIMEOUT,
    ): Closure {
        $hostSelectorJs = self::toJsStringLiteral($shadowHostSelector);

        $clickSelectorJs = self::toJsStringLiteral($clickElementSelector);

        return function (Page $page) use ($hostSelectorJs, $clickSelectorJs, $timeout) {
            // The script polls every 25 ms, first for the shadow host and then for the element inside of it.
            $page->evaluate(<<<JS
            (async function() {
                let shadowHostElement = document.querySelector({$hostSelectorJs});

                while (!shadowHostElement) {
                    await new Promise(resolve => setTimeout(resolve, 25));
                    shadowHostElement = document.querySelector({$hostSelectorJs});
                }

                if (shadowHostElement.shadowRoot) {
                    let clickElement = shadowHostElement.shadowRoot.querySelector({$clickSelectorJs});

                    while (!clickElement) {
                        await new Promise(resolve => setTimeout(resolve, 25));
                        clickElement = shadowHostElement.shadowRoot.querySelector({$clickSelectorJs});
                    }

                    clickElement.dispatchEvent(new MouseEvent("click", { bubbles: true }));
                }
            })()
            JS)->waitForResponse($timeout);
        };
    }

    /**
     * Wait until an element matching the CSS selector exists, then move the mouse to it.
     */
    public static function moveMouseToElement(string $cssSelector, int $timeout = self::DEFAULT_TIMEOUT): Closure
    {
        return function (Page $page) use ($cssSelector, $timeout) {
            $page->waitUntilContainsElement($cssSelector, $timeout);

            $page->mouse()->find($cssSelector);
        };
    }

    /**
     * Move the mouse to the given position. When $steps is provided, it is passed as 'steps' option.
     */
    public static function moveMouseToPosition(int $x, int $y, ?int $steps = null): Closure
    {
        return function (Page $page) use ($x, $y, $steps) {
            if ($steps !== null) {
                $page->mouse()->move($x, $y, ['steps' => $steps]);
            } else {
                $page->mouse()->move($x, $y);
            }
        };
    }

    public static function scrollDown(int $distance): Closure
    {
        return function (Page $page) use ($distance) {
            $page->mouse()->scrollDown($distance);
        };
    }

    public static function scrollUp(int $distance): Closure
    {
        return function (Page $page) use ($distance) {
            $page->mouse()->scrollUp($distance);
        };
    }

    /**
     * Type text with the keyboard. When $delay is provided, it is set as key interval before typing.
     */
    public static function typeText(string $text, ?int $delay = null): Closure
    {
        return function (Page $page) use ($text, $delay) {
            if ($delay !== null) {
                $page->keyboard()->setKeyInterval($delay)->typeText($text);
            } else {
                $page->keyboard()->typeText($text);
            }
        };
    }

    /**
     * Evaluate JavaScript code in the page. The returned closure does not wait for the evaluation result.
     */
    public static function evaluate(string $jsCode): Closure
    {
        return function (Page $page) use ($jsCode) {
            $page->evaluate($jsCode);
        };
    }

    public static function waitForReload(int $timeout = self::DEFAULT_TIMEOUT): Closure
    {
        return function (Page $page) use ($timeout) {
            $page->waitForReload(timeout: $timeout);
        };
    }

    /**
     * Pause execution for the given number of seconds (fractions are possible).
     */
    public static function wait(float $seconds): Closure
    {
        return function () use ($seconds) {
            usleep(Microseconds::fromSeconds($seconds)->value);
        };
    }

    /**
     * Take a screenshot of the page and save it to the file path defined by the config.
     *
     * The returned closure returns a Screenshot instance on success. On any failure, it logs an error
     * (plus the exception message on debug level) to the optional logger and returns null.
     */
    public static function screenshot(ScreenshotConfig $config): Closure
    {
        return function (Page $page, ?LoggerInterface $logger) use ($config) {
            $fullFilePath = $config->getFullPath($page);

            try {
                $page->screenshot($config->toChromePhpScreenshotConfig($page))->saveToFile($fullFilePath);

                return new Screenshot($fullFilePath);
            } catch (Throwable $exception) {
                $logger?->error('Failed to take screenshot.');

                $logger?->debug($exception->getMessage());

                return null;
            }
        };
    }

    /**
     * @deprecated Use the two methods evaluate() and waitForReload() separately.
     */
    public static function evaluateAndWaitForReload(string $jsCode): Closure
    {
        return function (Page $page) use ($jsCode) {
            $page->evaluate($jsCode)->waitForPageReload();
        };
    }

    /**
     * @deprecated Use the two methods clickElement() and waitForReload() separately.
     */
    public static function clickElementAndWaitForReload(string $cssSelector): Closure
    {
        return function (Page $page) use ($cssSelector) {
            $page->waitUntilContainsElement($cssSelector);

            $page->mouse()->find($cssSelector)->click();

            $page->waitForReload();
        };
    }

    /**
     * Encode a PHP string as a (double-quoted) JavaScript string literal, including the quotes.
     *
     * Invalid UTF-8 sequences are substituted instead of making the encoding fail.
     */
    private static function toJsStringLiteral(string $value): string
    {
        return json_encode($value, JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE);
    }
}


================================================
FILE: src/Steps/Loading/Http/Document.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Url\Url;
use Exception;
use Psr\Log\LoggerInterface;

/**
 * Wraps a responded request as an HTML document and provides its URL, its base URL
 * (considering a <base> tag) and its canonical URL.
 */
final class Document
{
    private HtmlDocument $dom;

    private Url $url;

    private Url $baseUrl;

    /**
     * Resolved lazily on the first call to canonicalUrl().
     */
    private ?Url $canonicalUrl = null;

    public function __construct(
        private readonly RespondedRequest $respondedRequest,
        private readonly ?LoggerInterface $logger = null,
    ) {
        $this->dom = new HtmlDocument(Http::getBodyString($this->respondedRequest));

        $this->setBaseUrl();
    }

    public function dom(): HtmlDocument
    {
        return $this->dom;
    }

    /**
     * @return Url the effective URI of the responded request
     */
    public function url(): Url
    {
        return $this->url;
    }

    /**
     * @return Url the document URL, resolved against the <base> href if the document has one
     */
    public function baseUrl(): Url
    {
        return $this->baseUrl;
    }

    /**
     * Get the canonical URL of the document as string.
     *
     * It is the href of a link[rel=canonical] element, resolved against the base URL.
     * Falls back to the document URL when there is no usable canonical link.
     */
    public function canonicalUrl(): string
    {
        if ($this->canonicalUrl === null) {
            $this->canonicalUrl = $this->resolveCanonicalLink() ?? $this->url;
        }

        return (string) $this->canonicalUrl;
    }

    /**
     * Look for a canonical link element in the DOM and resolve its href against the base URL.
     *
     * @return Url|null null when there is no such element, no href, or resolving the href failed
     */
    private function resolveCanonicalLink(): ?Url
    {
        $linkElement = $this->dom->querySelector('link[rel=canonical]');

        if (!$linkElement) {
            return null;
        }

        $href = $linkElement->getAttribute('href');

        if (!$href) {
            return null;
        }

        try {
            return $this->baseUrl->resolve($href);
        } catch (Exception) {
            $this->logger?->warning(
                'Failed to resolve canonical link href value against the document base URL.',
            );
        }

        return null;
    }

    /**
     * Initialize the URL and base URL properties from the responded request and the document's <base> tag.
     */
    private function setBaseUrl(): void
    {
        $this->url = Url::parse($this->respondedRequest->effectiveUri());

        $this->baseUrl = $this->url;

        $baseHref = $this->dom->getBaseHref();

        if (!$baseHref) {
            return;
        }

        try {
            $this->baseUrl = $this->baseUrl->resolve($baseHref);
        } catch (Exception) {
            $this->logger?->warning('Failed to resolve the document <base> tag href against the document URL.');
        }
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginate.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http;

use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http;
use Exception;
use Generator;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\StreamInterface;
use Psr\Http\Message\UriInterface;

/**
 * HTTP loading step that loads an input URL and then keeps loading the requests
 * that the given paginator provides, until the paginator has finished.
 *
 * @deprecated This class shall be removed in the next major version (v4).
 *             See the comment above the Http::transferSettingsToPaginateStep() method.
 */
class Paginate extends Http
{
    public function __construct(
        protected AbstractPaginator $paginator,
        string $method = 'GET',
        array $headers = [],
        string|StreamInterface|null $body = null,
        string $httpVersion = '1.1',
    ) {
        parent::__construct($method, $headers, $body, $httpVersion);
    }

    /**
     * Paginate for a single input URL, or for each one in an array of input URLs.
     *
     * @param UriInterface|UriInterface[] $input
     * @throws LoadingException
     */
    protected function invoke(mixed $input): Generator
    {
        if (is_array($input)) {
            foreach ($input as $inputUrl) {
                yield from $this->paginateInputUrl($inputUrl);
            }
        } else {
            yield from $this->paginateInputUrl($input);
        }
    }

    /**
     * Load the input URL, then all further requests the paginator comes up with, and yield every response.
     *
     * Requests without a response are still passed on to the paginator, so it can keep track of them.
     *
     * @throws LoadingException
     */
    private function paginateInputUrl(UriInterface $url): Generator
    {
        $request = $this->getRequestFromInputUri($url);

        $response = $this->getResponseFromRequest($request);

        if ($response) {
            yield $response;
        }

        $this->processLoaded($request, $response);

        while (!$this->paginator->hasFinished()) {
            $request = $this->paginator->getNextRequest();

            if (!$request) {
                break;
            }

            $response = $this->getResponseFromRequest($request);

            if ($response) {
                yield $response;
            }

            $this->processLoaded($request, $response);
        }

        $this->finish();
    }

    /**
     * Log why paginating ended (only if there is a logger) and reset the paginator state.
     *
     * The reset must not depend on a logger being present: the paginator has to start from scratch
     * for the next input URL in any case.
     */
    private function finish(): void
    {
        if ($this->logger) {
            $this->paginator->logWhenFinished($this->logger);
        }

        $this->paginator->resetFinished();
    }

    /**
     * Hand a loaded request over to the paginator. Exceptions thrown there are logged, not propagated.
     */
    private function processLoaded(RequestInterface $request, ?RespondedRequest $response): void
    {
        try {
            $this->paginator->processLoaded($request, $response);
        } catch (Exception $exception) {
            $this->logger?->error('Paginate Error: ' . $exception->getMessage());
        }
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginator.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http;

use Crwlr\Crawler\Steps\Html\DomQuery;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParamsPaginator;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\SimpleWebsitePaginator;

/**
 * Static factory for the paginator implementations.
 */
class Paginator
{
    /**
     * Max pages limit that is used when no explicit limit is provided.
     */
    public const MAX_PAGES_DEFAULT = 1000;

    /**
     * Create a paginator that finds pagination links in the loaded documents, using the given selector.
     *
     * @throws InvalidDomQueryException
     */
    public static function simpleWebsite(
        string|DomQuery $paginationLinksSelector,
        int $maxPages = self::MAX_PAGES_DEFAULT,
    ): SimpleWebsitePaginator {
        $paginator = new SimpleWebsitePaginator($paginationLinksSelector, $maxPages);

        return $paginator;
    }

    /**
     * Create a paginator that manipulates query params to get to the next page.
     */
    public static function queryParams(int $maxPages = self::MAX_PAGES_DEFAULT): QueryParamsPaginator
    {
        $paginator = new QueryParamsPaginator($maxPages);

        return $paginator;
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/QueryParams/AbstractQueryParamManipulator.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParams;

use Adbar\Dot;
use Crwlr\QueryString\Query;
use Exception;

/**
 * Base class for manipulators that change the value of one query param, identified by its name.
 */
abstract class AbstractQueryParamManipulator implements QueryParamManipulator
{
    public function __construct(protected string $queryParamName) {}

    /**
     * Get the current value of the param from the query, or the fallback value if the query doesn't have it.
     *
     * @throws Exception
     */
    protected function getCurrentValue(Query $query, mixed $fallbackValue = null): mixed
    {
        return $query->has($this->queryParamName) ? $query->get($this->queryParamName) : $fallbackValue;
    }

    /**
     * Same as getCurrentValue(), but the param name is looked up as a dot notation path
     * in the array representation of the query.
     *
     * @throws Exception
     */
    protected function getCurrentValueUsingDotNotation(Query $query, mixed $fallbackValue = null): mixed
    {
        $queryData = new Dot($query->toArray());

        $value = $queryData->get($this->queryParamName, $fallbackValue);

        return $value;
    }

    /**
     * @return int the current value cast to int
     * @throws Exception
     */
    protected function getCurrentValueAsInt(Query $query): int
    {
        $value = $this->getCurrentValue($query);

        return (int) $value;
    }

    /**
     * @return int the current value, looked up using dot notation, cast to int
     * @throws Exception
     */
    protected function getCurrentValueAsIntUsingDotNotation(Query $query): int
    {
        $value = $this->getCurrentValueUsingDotNotation($query);

        return (int) $value;
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/QueryParams/Decrementor.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParams;

use Adbar\Dot;
use Crwlr\QueryString\Query;
use Exception;

/**
 * Query param manipulator that subtracts a fixed number from the current (integer) value of a param.
 */
class Decrementor extends AbstractQueryParamManipulator
{
    public function __construct(
        string $queryParamName,
        protected int $decrement = 1,
        protected bool $useDotNotation = false,
    ) {
        parent::__construct($queryParamName);
    }

    /**
     * Decrease the param value and return the resulting query.
     *
     * Without dot notation, the value is set on the given query object. With dot notation, a new
     * Query instance is built from the changed array data.
     *
     * @throws Exception
     */
    public function execute(Query $query): Query
    {
        if (!$this->useDotNotation) {
            $decreased = $this->getCurrentValueAsInt($query) - $this->decrement;

            return $query->set($this->queryParamName, (string) $decreased);
        }

        $queryData = new Dot($query->toArray());

        $decreased = $this->getCurrentValueAsIntUsingDotNotation($query) - $this->decrement;

        $queryData = $queryData->set($this->queryParamName, (string) $decreased);

        return new Query($queryData->all());
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/QueryParams/Incrementor.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParams;

use Adbar\Dot;
use Crwlr\QueryString\Query;
use Exception;

/**
 * Query param manipulator that adds a fixed number to the current (integer) value of a param.
 */
class Incrementor extends AbstractQueryParamManipulator
{
    public function __construct(
        string $queryParamName,
        protected int $increment = 1,
        protected bool $useDotNotation = false,
    ) {
        parent::__construct($queryParamName);
    }

    /**
     * Increase the param value and return the resulting query.
     *
     * Without dot notation, the value is set on the given query object. With dot notation, a new
     * Query instance is built from the changed array data.
     *
     * @throws Exception
     */
    public function execute(Query $query): Query
    {
        if (!$this->useDotNotation) {
            $increased = $this->getCurrentValueAsInt($query) + $this->increment;

            return $query->set($this->queryParamName, (string) $increased);
        }

        $queryData = new Dot($query->toArray());

        $increased = $this->getCurrentValueAsIntUsingDotNotation($query) + $this->increment;

        $queryData = $queryData->set($this->queryParamName, (string) $increased);

        return new Query($queryData->all());
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/QueryParams/QueryParamManipulator.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParams;

use Crwlr\QueryString\Query;

/**
 * Contract for objects that change something in a query, to get from one page of a paginated
 * resource to the next one.
 */
interface QueryParamManipulator
{
    /**
     * Apply the manipulation to the query.
     *
     * Callers must continue with the returned Query, it is not necessarily the instance that was passed in.
     */
    public function execute(Query $query): Query;
}


================================================
FILE: src/Steps/Loading/Http/Paginators/QueryParamsPaginator.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators;

use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Loading\Http\Paginator;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParams\Decrementor;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParams\Incrementor;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParams\QueryParamManipulator;
use Crwlr\QueryString\Query;
use Crwlr\Url\Url;
use Exception;
use GuzzleHttp\Psr7\Utils;
use Psr\Http\Message\RequestInterface;

/**
 * Paginator that creates the next request by manipulating query params of the latest request,
 * either in the URL query string or in the request body.
 */
class QueryParamsPaginator extends Http\AbstractPaginator
{
    /**
     * @var QueryParamManipulator[]
     */
    protected array $manipulators = [];

    /**
     * @var bool True means the class handles URL query params, false means it's about params sent as request body.
     */
    protected bool $paramsInUrl = true;

    /**
     * Create an instance that manipulates params in the URL query string.
     */
    public static function paramsInUrl(int $maxPages = Paginator::MAX_PAGES_DEFAULT): self
    {
        return new self($maxPages);
    }

    public function inUrl(): self
    {
        $this->paramsInUrl = true;

        return $this;
    }

    /**
     * Create an instance that manipulates params in the request body.
     */
    public static function paramsInBody(int $maxPages = Paginator::MAX_PAGES_DEFAULT): self
    {
        $instance = new self($maxPages);

        $instance->paramsInUrl = false;

        return $instance;
    }

    public function inBody(): self
    {
        $this->paramsInUrl = false;

        return $this;
    }

    /**
     * Increase the value of a query param for every next request.
     */
    public function increase(string $queryParamName, int $by = 1, bool $useDotNotation = false): self
    {
        $this->manipulators[] = new Incrementor($queryParamName, $by, $useDotNotation);

        return $this;
    }

    public function increaseUsingDotNotation(string $queryParamName, int $by = 1): self
    {
        $this->manipulators[] = new Incrementor($queryParamName, $by, true);

        return $this;
    }

    /**
     * Decrease the value of a query param for every next request.
     */
    public function decrease(string $queryParamName, int $by = 1, bool $useDotNotation = false): self
    {
        $this->manipulators[] = new Decrementor($queryParamName, $by, $useDotNotation);

        return $this;
    }

    public function decreaseUsingDotNotation(string $queryParamName, int $by = 1): self
    {
        $this->manipulators[] = new Decrementor($queryParamName, $by, true);

        return $this;
    }

    /**
     * Build the next request from the latest one, by running all manipulators on its query.
     *
     * @return RequestInterface|null null when no request has been registered yet
     * @throws Exception
     */
    public function getNextRequest(): ?RequestInterface
    {
        if (!$this->latestRequest) {
            return null;
        }

        $url = $this->paramsInUrl ? Url::parse($this->latestRequest->getUri()) : null;

        $query = $url !== null ?
            $url->queryString() :
            Query::fromString(Http::getBodyString($this->latestRequest));

        $initialQuery = $query;

        foreach ($this->manipulators as $manipulator) {
            $query = $manipulator->execute($query);
        }

        if ($url === null) {
            return $this->latestRequest->withBody(Utils::streamFor($query->toString()));
        }

        // Manipulators may return a new Query instance (they do so in dot notation mode). Such an instance
        // is not the one obtained from the URL object, so its changes have to be written to the URL here.
        // NOTE(review): relies on Url::query() acting as setter when called with a string - confirm.
        if ($query !== $initialQuery) {
            $url->query($query->toString());
        }

        return $this->latestRequest->withUri($url->toPsr7());
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/SimpleWebsitePaginator.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom;
use Crwlr\Crawler\Steps\Html\CssSelector;
use Crwlr\Crawler\Steps\Html\DomQuery;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Utils\RequestKey;
use Crwlr\Url\Url;
use Exception;
use Psr\Http\Message\RequestInterface;
use Psr\Log\LoggerInterface;

/**
 * Paginator that collects pagination links from the loaded HTML documents, using a DOM query,
 * and loads the found URLs one after another.
 */
class SimpleWebsitePaginator extends Http\AbstractPaginator
{
    /**
     * Queue of found, not yet requested URLs, keyed by URL. 'foundOn' is the key of the request
     * on whose response the URL was found.
     *
     * @var array<string, array{ url: string, foundOn: string }>
     */
    protected array $found = [];

    /**
     * @var array<string, true>
     */
    protected array $loadedUrls = [];

    protected DomQuery $paginationLinksSelector;

    protected string $latestRequestKey = '';

    /**
     * @var array<string, RequestInterface>
     */
    protected array $parentRequests = [];

    /**
     * @param string|DomQuery $paginationLinksSelector a string is used as CSS selector
     * @throws InvalidDomQueryException
     */
    public function __construct(string|DomQuery $paginationLinksSelector, int $maxPages = 1000)
    {
        if (is_string($paginationLinksSelector)) {
            $this->paginationLinksSelector = Dom::cssSelector($paginationLinksSelector);
        } else {
            $this->paginationLinksSelector = $paginationLinksSelector;
        }

        parent::__construct($maxPages);
    }

    /**
     * @return bool true when the max pages limit is reached, no found URLs are left, or a stop rule has fired
     */
    public function hasFinished(): bool
    {
        return $this->maxPagesReached() || empty($this->found) || $this->hasFinished;
    }

    /**
     * Take the next found URL from the queue and create a request for it, based on the request
     * on whose response the URL was found.
     *
     * @return RequestInterface|null null when no request was registered yet, or no found URLs are left
     */
    public function getNextRequest(): ?RequestInterface
    {
        if (!$this->latestRequest) {
            return null;
        }

        $nextUrl = array_shift($this->found);

        if (!$nextUrl) {
            return null;
        }

        // The parent request can be unknown: addFoundUrl() only stores one when there is a latest request
        // at that time, and cleanUpParentRequests() drops parents without queued children. Fall back to
        // the latest request then, instead of calling a method on null.
        $request = $this->parentRequests[$nextUrl['foundOn']] ?? $this->latestRequest;

        $this->cleanUpParentRequests();

        return $request->withUri(Url::parsePsr7($nextUrl['url']));
    }

    /**
     * Register the loaded request, remember its URL (and the URLs of the redirect chain) as loaded
     * and collect pagination links from the response, if there is one.
     *
     * @throws Exception
     */
    public function processLoaded(
        RequestInterface $request,
        ?RespondedRequest $respondedRequest,
    ): void {
        $this->registerLoadedRequest($respondedRequest ?? $request);

        if ($this->latestRequest) {
            $this->latestRequestKey = RequestKey::from($this->latestRequest);
        }

        $this->loadedUrls[$request->getUri()->__toString()] = true;

        if ($respondedRequest) {
            foreach ($respondedRequest->redirects() as $redirectUrl) {
                $this->loadedUrls[$redirectUrl] = true;
            }

            $this->getPaginationLinksFromResponse($respondedRequest);
        }
    }

    public function logWhenFinished(LoggerInterface $logger): void
    {
        if ($this->maxPagesReached() && !empty($this->found)) {
            $logger->notice('Max pages limit reached');
        } else {
            $logger->info('All found pagination links loaded');
        }
    }

    /**
     * Query the response document with the pagination links selector. Every matched element is checked
     * itself, as well as all the 'a' elements inside of it.
     *
     * @throws Exception
     */
    protected function getPaginationLinksFromResponse(RespondedRequest $respondedRequest): void
    {
        $responseBody = Http::getBodyString($respondedRequest);

        $document = new Dom\HtmlDocument($responseBody);

        $paginationLinksElements = $this->paginationLinksSelector instanceof CssSelector ?
            $document->querySelectorAll($this->paginationLinksSelector->query) :
            $document->queryXPath($this->paginationLinksSelector->query);

        foreach ($paginationLinksElements as $paginationLinksElement) {
            /** @var Dom\HtmlElement $paginationLinksElement */
            $this->addFoundUrlFromLinkElement(
                $paginationLinksElement,
                $document,
                $respondedRequest->effectiveUri(),
            );

            foreach ($paginationLinksElement->querySelectorAll('a') as $linkInPaginationLinksElement) {
                $this->addFoundUrlFromLinkElement(
                    $linkInPaginationLinksElement,
                    $document,
                    $respondedRequest->effectiveUri(),
                );
            }
        }
    }

    /**
     * Add the absolute URL of the element's href to the found URLs, if it is a relevant link element.
     *
     * @throws Exception
     */
    protected function addFoundUrlFromLinkElement(
        Dom\HtmlElement $linkElement,
        Dom\HtmlDocument $document,
        string $documentUrl,
    ): void {
        if ($this->isRelevantLinkElement($linkElement)) {
            $url = $this->getAbsoluteUrlFromLinkElement($linkElement, $document, $documentUrl);

            $this->addFoundUrl($url);
        }
    }

    /**
     * Resolve the link's href against the document URL, or against the <base> href if the document has one.
     *
     * @throws Exception
     */
    protected function getAbsoluteUrlFromLinkElement(
        Dom\HtmlElement $linkElement,
        Dom\HtmlDocument $document,
        string $documentUrl,
    ): string {
        $baseUrl = Url::parse($documentUrl);

        $baseHref = $document->getBaseHref();

        if ($baseHref) {
            $baseUrl = $baseUrl->resolve($baseHref);
        }

        $linkHref = $linkElement->getAttribute('href') ?? '';

        return $baseUrl->resolve($linkHref)->__toString();
    }

    /**
     * @return bool true for 'a' elements with a non-empty href that doesn't start with '#'
     */
    protected function isRelevantLinkElement(Dom\HtmlElement $element): bool
    {
        if ($element->nodeName() !== 'a') {
            return false;
        }

        $href = $element->getAttribute('href');

        return !empty($href) && !str_starts_with($href, '#');
    }

    /**
     * Queue a URL, unless it is already queued or was already loaded. Also remembers the latest request
     * as parent request for the URL.
     */
    protected function addFoundUrl(string $url): void
    {
        if (!isset($this->found[$url]) && !isset($this->loadedUrls[$url])) {
            if ($this->latestRequest && !array_key_exists($this->latestRequestKey, $this->parentRequests)) {
                $this->parentRequests[$this->latestRequestKey] = $this->latestRequest;
            }

            $this->found[$url] = ['url' => $url, 'foundOn' => $this->latestRequestKey];
        }
    }

    /**
     * The parent requests for found links are stored, so the new requests are always created from the actual parent,
     * not the latest registered response. After getting the next request to load, always check for all parent
     * requests, if there are still children in the found URLs. If not, the parent request can be forgotten, so we
     * keep memory usage as low as possible.
     */
    protected function cleanUpParentRequests(): void
    {
        foreach ($this->parentRequests as $requestKey => $request) {
            foreach ($this->found as $found) {
                if ($found['foundOn'] === $requestKey) {
                    continue 2;
                }
            }

            unset($this->parentRequests[$requestKey]);
        }
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/StopRules/Contains.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http;
use Psr\Http\Message\RequestInterface;

/**
 * Stop rule that signals to stop as soon as the (trimmed) response body contains a given string.
 */
class Contains implements StopRule
{
    /**
     * @param string $contains the string to look for in the response body
     */
    public function __construct(protected string $contains) {}

    /**
     * Returns true when there is no response at all, or when the response body contains the configured string.
     *
     * @throws MissingZlibExtensionException
     */
    public function shouldStop(RequestInterface $request, ?RespondedRequest $respondedRequest): bool
    {
        if ($respondedRequest === null) {
            return true;
        }

        $body = trim(Http::getBodyString($respondedRequest->response));

        return str_contains($body, $this->contains);
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInDom.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom\DomDocument;
use Crwlr\Crawler\Steps\Dom\HtmlElement;
use Crwlr\Crawler\Steps\Dom\XmlElement;
use Crwlr\Crawler\Steps\Html\CssSelector;
use Crwlr\Crawler\Steps\Html\DomQuery;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Crawler\Steps\Loading\Http;
use Psr\Http\Message\RequestInterface;
use Throwable;

/**
 * Base for stop rules that inspect elements of a DOM document built from the response body.
 *
 * Child classes decide, via makeDom(), which kind of document the body is parsed into.
 */
abstract class IsEmptyInDom implements StopRule
{
    /**
     * @param string|DomQuery $selector a plain string is used as a CSS selector
     */
    public function __construct(protected string|DomQuery $selector) {}

    /**
     * Returns true when there is no response, the body can't be turned into a document, the selector matches
     * nothing, or all matched elements are empty. Returns false as soon as one matched element has content.
     *
     * @throws InvalidDomQueryException|MissingZlibExtensionException
     */
    public function shouldStop(RequestInterface $request, ?RespondedRequest $respondedRequest): bool
    {
        if ($respondedRequest === null) {
            return true;
        }

        $body = trim(Http::getBodyString($respondedRequest->response));

        try {
            $dom = $this->makeDom($body);
        } catch (Throwable) {
            // A body that can't be parsed is handled like a document without any matching element.
            return true;
        }

        $query = is_string($this->selector) ? new CssSelector($this->selector) : $this->selector;

        if ($query instanceof CssSelector) {
            $matches = $dom->querySelectorAll($query->query);
        } else {
            $matches = $dom->queryXPath($query->query);
        }

        if ($matches->count() === 0) {
            return true;
        }

        foreach ($matches as $match) {
            /** @var HtmlElement|XmlElement $match */
            if (!$this->nodeIsEmpty($match)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Creates the document, that the selector is applied to, from the response body.
     */
    abstract protected function makeDom(string $source): DomDocument;

    /**
     * A node counts as empty when its inner HTML (or inner XML for XML elements) is empty after trimming.
     */
    private function nodeIsEmpty(HtmlElement|XmlElement $node): bool
    {
        $inner = $node instanceof HtmlElement ? $node->innerHtml() : $node->innerXml();

        return trim($inner) === '';
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInHtml.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Steps\Dom\DomDocument;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;

/**
 * Variant of the IsEmptyInDom stop rule that reads the response body as an HTML document.
 */
class IsEmptyInHtml extends IsEmptyInDom
{
    /**
     * Builds an HtmlDocument from the response body source.
     */
    protected function makeDom(string $source): DomDocument
    {
        $htmlDocument = new HtmlDocument($source);

        return $htmlDocument;
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInJson.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules;

use Adbar\Dot;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Utils\Exceptions\InvalidJsonException;
use Crwlr\Utils\Json;
use Psr\Http\Message\RequestInterface;

/**
 * Stop rule that signals to stop when a certain value in a JSON response body is empty.
 */
class IsEmptyInJson implements StopRule
{
    /**
     * @param string $dotNotationKey path to the value to check, in dot notation
     */
    public function __construct(protected string $dotNotationKey) {}

    /**
     * Returns true when there is no response, or when the value behind the configured dot notation key is
     * empty (in terms of PHP's empty()) in the decoded JSON body.
     *
     * @throws InvalidJsonException|\Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException
     */
    public function shouldStop(RequestInterface $request, ?RespondedRequest $respondedRequest): bool
    {
        if ($respondedRequest === null) {
            return true;
        }

        $body = trim(Http::getBodyString($respondedRequest->response));

        $data = new Dot(Json::stringToArray($body));

        $value = $data->get($this->dotNotationKey);

        return empty($value);
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/StopRules/IsEmptyInXml.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Steps\Dom\DomDocument;
use Crwlr\Crawler\Steps\Dom\XmlDocument;

/**
 * Variant of the IsEmptyInDom stop rule that reads the response body as an XML document.
 */
class IsEmptyInXml extends IsEmptyInDom
{
    /**
     * Builds an XmlDocument from the response body source.
     */
    protected function makeDom(string $source): DomDocument
    {
        $xmlDocument = new XmlDocument($source);

        return $xmlDocument;
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/StopRules/IsEmptyResponse.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http;
use Psr\Http\Message\RequestInterface;

/**
 * Stop rule that signals to stop when the response body is empty.
 */
class IsEmptyResponse implements StopRule
{
    /**
     * Returns true when there is no response, or when the trimmed body is an empty string, an empty JSON
     * array (`[]`) or an empty JSON object (`{}`).
     *
     * @throws \Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException
     */
    public function shouldStop(RequestInterface $request, ?RespondedRequest $respondedRequest): bool
    {
        if ($respondedRequest === null) {
            return true;
        }

        $body = trim(Http::getBodyString($respondedRequest->response));

        return in_array($body, ['', '[]', '{}'], true);
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/StopRules/NotContains.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http;
use Psr\Http\Message\RequestInterface;

/**
 * Stop rule that signals to stop as soon as the (trimmed) response body does not contain a given string.
 */
class NotContains implements StopRule
{
    /**
     * @param string $contains the string that the response body is expected to contain
     */
    public function __construct(protected string $contains) {}

    /**
     * Returns true when there is no response at all, or when the configured string is missing in the body.
     *
     * @throws MissingZlibExtensionException
     */
    public function shouldStop(RequestInterface $request, ?RespondedRequest $respondedRequest): bool
    {
        if ($respondedRequest === null) {
            return true;
        }

        $body = trim(Http::getBodyString($respondedRequest->response));

        $found = str_contains($body, $this->contains);

        return !$found;
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/StopRules/PaginatorStopRules.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Steps\Html\DomQuery;

/**
 * Static factory for the stop rule classes of this namespace.
 */
class PaginatorStopRules
{
    /**
     * Stop rule checking the response body for being empty.
     */
    public static function isEmptyResponse(): IsEmptyResponse
    {
        return new IsEmptyResponse();
    }

    /**
     * Stop rule checking a value, addressed via dot notation, in a JSON response body for being empty.
     */
    public static function isEmptyInJson(string $dotNotationKey): IsEmptyInJson
    {
        return new IsEmptyInJson($dotNotationKey);
    }

    /**
     * Stop rule checking the elements matching the selector in an HTML response body for being empty.
     *
     * @param string|DomQuery $selector a plain string is used as a CSS selector
     */
    public static function isEmptyInHtml(string|DomQuery $selector): IsEmptyInHtml
    {
        return new IsEmptyInHtml($selector);
    }

    /**
     * Stop rule checking the elements matching the selector in an XML response body for being empty.
     *
     * @param string|DomQuery $selector a plain string is used as a CSS selector
     */
    public static function isEmptyInXml(string|DomQuery $selector): IsEmptyInXml
    {
        return new IsEmptyInXml($selector);
    }

    /**
     * Stop rule checking if the response body contains the given string.
     */
    public static function contains(string $string): Contains
    {
        return new Contains($string);
    }

    /**
     * Stop rule checking if the response body does not contain the given string.
     */
    public static function notContains(string $string): NotContains
    {
        return new NotContains($string);
    }
}


================================================
FILE: src/Steps/Loading/Http/Paginators/StopRules/StopRule.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Psr\Http\Message\RequestInterface;

/**
 * Contract for rules that tell a paginator when to stop.
 */
interface StopRule
{
    /**
     * Decides, based on a request and the response received for it, whether to stop.
     *
     * @param RequestInterface $request the request that was sent
     * @param RespondedRequest|null $respondedRequest the request together with its response; may be null, the
     *                                                implementations in this namespace then return true
     * @return bool true to stop, false to continue
     */
    public function shouldStop(RequestInterface $request, ?RespondedRequest $respondedRequest): bool;
}


================================================
FILE: src/Steps/Loading/Http.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator;
use Crwlr\Crawler\Steps\Loading\Http\Paginate;
use Crwlr\Crawler\Steps\Loading\Http\Paginator;
use Crwlr\Crawler\Steps\StepOutputType;
use Crwlr\Crawler\Utils\Gzip;
use Exception;
use Generator;
use Psr\Http\Message\MessageInterface;
use Psr\Http\Message\StreamInterface;
use Psr\Http\Message\UriInterface;

/**
 * HTTP loading step with static factories per request method and a helper to read message bodies.
 */
class Http extends HttpBase
{
    /**
     * Creates an HttpCrawl step using the given request headers and HTTP protocol version.
     *
     * @param array|(string|string[])[] $headers
     */
    public static function crawl(array $headers = [], string $httpVersion = '1.1'): HttpCrawl
    {
        return new HttpCrawl($headers, $httpVersion);
    }

    /**
     * Creates a step sending GET requests (without a body).
     *
     * @param array|(string|string[])[] $headers
     */
    public static function get(array $headers = [], string $httpVersion = '1.1'): self
    {
        return new self(method: 'GET', headers: $headers, body: null, httpVersion: $httpVersion);
    }

    /**
     * Creates a step sending POST requests.
     *
     * @param array|(string|string[])[] $headers
     */
    public static function post(
        array $headers = [],
        string|StreamInterface|null $body = null,
        string $httpVersion = '1.1',
    ): self {
        return new self(method: 'POST', headers: $headers, body: $body, httpVersion: $httpVersion);
    }

    /**
     * Creates a step sending PUT requests.
     *
     * @param array|(string|string[])[] $headers
     */
    public static function put(
        array $headers = [],
        string|StreamInterface|null $body = null,
        string $httpVersion = '1.1',
    ): self {
        return new self(method: 'PUT', headers: $headers, body: $body, httpVersion: $httpVersion);
    }

    /**
     * Creates a step sending PATCH requests.
     *
     * @param array|(string|string[])[] $headers
     */
    public static function patch(
        array $headers = [],
        string|StreamInterface|null $body = null,
        string $httpVersion = '1.1',
    ): self {
        return new self(method: 'PATCH', headers: $headers, body: $body, httpVersion: $httpVersion);
    }

    /**
     * Creates a step sending DELETE requests.
     *
     * @param array|(string|string[])[] $headers
     */
    public static function delete(
        array $headers = [],
        string|StreamInterface|null $body = null,
        string $httpVersion = '1.1',
    ): self {
        return new self(method: 'DELETE', headers: $headers, body: $body, httpVersion: $httpVersion);
    }

    /**
     * Reads the full body of an HTTP message as a string.
     *
     * The body stream is rewound before and after reading. Reading a stream a second time without
     * rewinding it just yields an empty string, so better always use this helper. If the message has the
     * Content-Type header value `application/x-gzip`, the contents are returned gzip-decoded.
     *
     * @throws MissingZlibExtensionException
     */
    public static function getBodyString(MessageInterface|RespondedRequest $message): string
    {
        if ($message instanceof RespondedRequest) {
            $message = $message->response;
        }

        $stream = $message->getBody();

        $stream->rewind();

        $raw = $stream->getContents();

        $stream->rewind();

        $isGzipped = in_array('application/x-gzip', $message->getHeader('Content-Type'), true);

        return $isGzipped ? Gzip::decode($raw) : $raw;
    }

    /**
     * Turns this step into a Paginate step, keeping the request configuration and settings of this instance.
     *
     * When a string is provided instead of a paginator instance, it is handed to Paginator::simpleWebsite()
     * together with $defaultPaginatorMaxPages, to create the paginator.
     *
     * @throws InvalidDomQueryException
     */
    public function paginate(
        AbstractPaginator|string $paginator,
        int $defaultPaginatorMaxPages = Paginator::MAX_PAGES_DEFAULT,
    ): Paginate {
        $paginatorInstance = is_string($paginator) ?
            Paginator::simpleWebsite($paginator, $defaultPaginatorMaxPages) :
            $paginator;

        $paginateStep = new Paginate(
            $paginatorInstance,
            $this->method,
            $this->headers,
            $this->body,
            $this->httpVersion,
        );

        return $this->transferSettingsToPaginateStep($paginateStep);
    }

    public function outputType(): StepOutputType
    {
        return StepOutputType::AssociativeArrayOrObject;
    }

    /**
     * Loads all input URIs one after another and yields every response that was obtained. When all URIs
     * are processed, the request parameters taken from the input are reset.
     *
     * @param UriInterface|UriInterface[] $input
     * @return Generator<RespondedRequest>
     * @throws Exception
     */
    protected function invoke(mixed $input): Generator
    {
        $uris = is_array($input) ? $input : [$input];

        foreach ($uris as $uri) {
            $respondedRequest = $this->getResponseFromInputUri($uri);

            if ($respondedRequest !== null) {
                yield $respondedRequest;
            }
        }

        $this->resetInputRequestParams();
    }

    /**
     * Temporary fix to transfer settings that may have already been defined on the current instance,
     * to a new Paginate step instance. This shall be fixed in the next major version (v4) by removing
     * the Paginate class and implementing it in the Http class directly.
     */
    private function transferSettingsToPaginateStep(Paginate $step): Paginate
    {
        // Handling of error responses.
        $step->stopOnErrorResponse = $this->stopOnErrorResponse;
        $step->yieldErrorResponses = $this->yieldErrorResponses;

        // Mapping of input data to request properties.
        $step->useAsUrl = $this->useAsUrl;
        $step->useAsBody = $this->useAsBody;
        $step->useAsHeaders = $this->useAsHeaders;
        $step->useAsHeader = $this->useAsHeader;
        $step->staticUrl = $this->staticUrl;

        // Loader related settings.
        $step->postBrowserNavigateHooks = $this->postBrowserNavigateHooks;
        $step->skipCache = $this->skipCache;
        $step->forceBrowserUsage = $this->forceBrowserUsage;

        return $step;
    }
}


================================================
FILE: src/Steps/Loading/HttpBase.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading;

use Closure;
use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Utils\HttpHeaders;
use Crwlr\Crawler\Utils\TemplateString;
use Exception;
use GuzzleHttp\Psr7\Request;
use InvalidArgumentException;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\StreamInterface;
use Psr\Http\Message\UriInterface;
use Throwable;

/**
 * Common base for HTTP loading steps: holds the request configuration, maps input data to request
 * properties (URL, body, headers) and loads requests via the HTTP loader.
 */
abstract class HttpBase extends Step
{
    /**
     * @use LoadingStep<HttpLoader>
     */
    use LoadingStep;

    protected bool $stopOnErrorResponse = false;

    protected bool $yieldErrorResponses = false;

    protected ?string $useAsUrl = null;

    protected ?string $useAsBody = null;

    protected ?string $inputBody = null;

    protected ?string $useAsHeaders = null;

    /**
     * Input key => header name
     *
     * @var null|array<string, string>
     */
    protected ?array $useAsHeader = null;

    /**
     * @var null|array<string, string|string[]>
     */
    protected ?array $inputHeaders = null;

    protected ?string $staticUrl = null;

    /**
     * @var Closure[]
     */
    protected array $postBrowserNavigateHooks = [];

    protected bool $skipCache = false;

    protected bool $forceBrowserUsage = false;

    /**
     * @param array<string, string|string[]> $headers
     */
    public function __construct(
        protected readonly string $method = 'GET',
        protected readonly array $headers = [],
        protected readonly string|StreamInterface|null $body = null,
        protected readonly string $httpVersion = '1.1',
    ) {}

    /**
     * Makes the step load requests via the loader's loadOrFail() method, instead of load().
     */
    public function stopOnErrorResponse(): static
    {
        $this->stopOnErrorResponse = true;

        return $this;
    }

    /**
     * Makes the step also return responses with a status code of 400 or above.
     */
    public function yieldErrorResponses(): static
    {
        $this->yieldErrorResponses = true;

        return $this;
    }

    /**
     * Choose key from array input to use its value as request URL
     *
     * If input is an array with string keys, you can define which key from that array should be used as the
     * URL for the HTTP request.
     */
    public function useInputKeyAsUrl(string $key): static
    {
        $this->useAsUrl = $key;

        return $this;
    }

    /**
     * Choose key from array input to use its value as request body
     *
     * If input is an array with string keys, you can define which key from that array should be used as the
     * body for the HTTP request.
     */
    public function useInputKeyAsBody(string $key): static
    {
        $this->useAsBody = $key;

        return $this;
    }

    /**
     * Choose key from array input to use its value as a request header
     *
     * If input is an array with string keys, you can choose a key from that array and map it to an HTTP
     * request header. Without an explicit header name, the input key itself is used as the header name.
     */
    public function useInputKeyAsHeader(string $key, ?string $asHeader = null): static
    {
        $this->useAsHeader ??= [];

        $this->useAsHeader[$key] = $asHeader ?? $key;

        return $this;
    }

    /**
     * Choose key from array input to use its value as request headers
     *
     * If input is an array with string keys, you can choose a key from that array that will be used as
     * headers for the HTTP request. So, the value behind that array key, has to be an array with header
     * names as keys. If you want to map just one single HTTP header from input, use the
     * `useInputKeyAsHeader()` method.
     */
    public function useInputKeyAsHeaders(string $key): static
    {
        $this->useAsHeaders = $key;

        return $this;
    }

    /**
     * Registers a hook for the (headless) browser. For steps with another method than GET, the hook is
     * not registered and a warning is logged instead.
     */
    public function postBrowserNavigateHook(Closure $callback): static
    {
        if ($this->method === 'GET') {
            $this->postBrowserNavigateHooks[] = $callback;

            return $this;
        }

        $this->logger?->warning(
            'A ' . $this->method . ' request cannot be executed using the (headless) browser, so post browser ' .
            'navigate hooks can\'t be defined for this step either.',
        );

        return $this;
    }

    /**
     * Skip using the cache for this step
     *
     * If you're using a cache in your crawler's loader, but want to skip using the cache for one
     * particular step in the chain, use this method.
     *
     * Attention: this has no effect if you directly use the loader in a custom child step.
     * If you want to use this feature, please use getResponseFromInputUri() or getResponseFromRequest()
     * instead of the loader.
     */
    public function skipCache(): static
    {
        $this->skipCache = true;

        return $this;
    }

    /**
     * This allows the step to temporarily switch the loader to use the (headless) Chrome browser,
     * even if it is configured to use the (guzzle) HTTP client. When a request is finished,
     * it resets the loader setting.
     *
     * Attention: this has no effect if you directly use the loader in a custom child step.
     * If you want to use this feature, please use getResponseFromInputUri() or getResponseFromRequest()
     * instead of the loader.
     */
    public function useBrowser(): static
    {
        $this->forceBrowserUsage = true;

        return $this;
    }

    /**
     * Defines a URL to use instead of one from the input. It is resolved as a template string with the
     * full original input data, when the input is validated.
     */
    public function staticUrl(string $url): static
    {
        $this->staticUrl = $url;

        return $this;
    }

    /**
     * Picks body and headers from the input (if configured), determines the URL(s) to load and converts
     * them to UriInterface instances.
     *
     * @return UriInterface|UriInterface[]
     * @throws InvalidArgumentException
     */
    protected function validateAndSanitizeInput(mixed $input): mixed
    {
        $this->getBodyFromArrayInput($input);

        $this->getHeadersFromArrayInput($input);

        $urlInput = $this->staticUrl ? $this->resolveStaticUrl() : $this->getUrlFromArrayInput($input);

        if (!is_array($urlInput)) {
            return $this->validateAndSanitizeToUriInterface($urlInput);
        }

        foreach ($urlInput as $key => $url) {
            $urlInput[$key] = $this->validateAndSanitizeToUriInterface($url);
        }

        return $urlInput;
    }

    /**
     * Short alias names for the output keys of this step.
     */
    protected function outputKeyAliases(): array
    {
        return [
            'url' => 'effectiveUri',
            'uri' => 'effectiveUri',
            'status' => 'responseStatusCode',
            'headers' => 'responseHeaders',
            'body' => 'responseBody',
        ];
    }

    /**
     * Builds the request for the input URI and loads it.
     *
     * @throws LoadingException
     */
    protected function getResponseFromInputUri(UriInterface $input): ?RespondedRequest
    {
        return $this->getResponseFromRequest($this->getRequestFromInputUri($input));
    }

    /**
     * Creates the request object for a URI. A body taken from the input takes precedence over the body
     * defined for the step, headers from the input are merged with the step's headers, and variables in
     * body and headers are resolved.
     */
    protected function getRequestFromInputUri(UriInterface $uri): RequestInterface
    {
        [$body, $headers] = $this->resolveVarsInRequestProperties(
            $this->inputBody ?? $this->body,
            $this->mergeHeaders(),
        );

        return new Request($this->method, $uri, $headers, $body, $this->httpVersion);
    }

    /**
     * Loads the request using the loader, with the temporary loader customizations of this step applied.
     * Responses with a status code of 400 or above are only returned when yieldErrorResponses is enabled.
     *
     * @throws LoadingException
     * @throws Exception
     */
    protected function getResponseFromRequest(RequestInterface $request): ?RespondedRequest
    {
        $loader = $this->getLoader();

        $resetConfig = $this->applyTempLoaderCustomizations();

        try {
            if ($this->stopOnErrorResponse) {
                $respondedRequest = $loader->loadOrFail($request);
            } else {
                $respondedRequest = $loader->load($request);
            }
        } finally {
            // The loader settings have to be restored, no matter if loading succeeded or not.
            $this->resetTempLoaderCustomizations($resetConfig);
        }

        if ($respondedRequest === null) {
            return null;
        }

        $isErrorResponse = $respondedRequest->response->getStatusCode() >= 400;

        return (!$isErrorResponse || $this->yieldErrorResponses) ? $respondedRequest : null;
    }

    /**
     * Applies the step's cache and browser settings to the loader and returns the info, which loader
     * setting has to be restored afterwards.
     *
     * @return array<string, mixed>
     * @throws Exception
     */
    private function applyTempLoaderCustomizations(): array
    {
        $loader = $this->getLoader();

        $resetConfig = ['resetToHttpClient' => false, 'resetToBrowser' => false];

        if ($this->skipCache) {
            $loader->skipCacheForNextRequest();
        }

        $isGetRequest = $this->method === 'GET';

        if (!$isGetRequest && ($this->forceBrowserUsage || $loader->usesHeadlessBrowser())) {
            $this->logger?->warning(
                'The (headless) browser can only be used for GET requests! Therefore this step will use the HTTP ' .
                'client for loading.',
            );

            if ($loader->usesHeadlessBrowser()) {
                $loader->useHttpClient();

                $resetConfig['resetToBrowser'] = true;
            }
        } elseif ($this->forceBrowserUsage && !$loader->usesHeadlessBrowser()) {
            $resetConfig['resetToHttpClient'] = true;

            $loader->useHeadlessBrowser();
        }

        if ($this->postBrowserNavigateHooks !== [] && $loader->usesHeadlessBrowser()) {
            $loader->browser()->setTempPostNavigateHooks($this->postBrowserNavigateHooks);
        }

        return $resetConfig;
    }

    /**
     * Switches the loader back to what it used, before applyTempLoaderCustomizations() changed it.
     *
     * @param array<string, mixed> $resetConfig
     */
    private function resetTempLoaderCustomizations(array $resetConfig): void
    {
        $loader = $this->getLoader();

        if ($resetConfig['resetToHttpClient'] === true) {
            try {
                $loader->useHttpClient();
            } catch (Throwable) {
            }

            return;
        }

        if ($resetConfig['resetToBrowser']) {
            $loader->useHeadlessBrowser();
        }
    }

    /**
     * Gets the URL from array input: either the value behind the key defined via useInputKeyAsUrl()
     * (wrapped in an array), or otherwise the value of the key `url` or `uri`. In any other case the
     * input is returned unchanged.
     */
    protected function getUrlFromArrayInput(mixed $input): mixed
    {
        if (!$this->useAsUrl) {
            if (is_array($input)) {
                foreach (['url', 'uri'] as $defaultKey) {
                    if (array_key_exists($defaultKey, $input)) {
                        return $input[$defaultKey];
                    }
                }
            }

            return $input;
        }

        if (!is_array($input)) {
            $this->logger?->warning('Input is not array, therefore can\'t get URL from input by key.');

            return $input;
        }

        if (array_key_exists($this->useAsUrl, $input)) {
            return [$input[$this->useAsUrl]];
        }

        $this->logger?->warning(
            'Input key ' . $this->useAsUrl . ' that should be used as request URL isn\'t present in input.',
        );

        return $input;
    }

    /**
     * Remembers the value behind the key defined via useInputKeyAsBody() as body for the request. Logs a
     * warning when the input is no array or doesn't contain the key.
     */
    protected function getBodyFromArrayInput(mixed $input): void
    {
        if (!$this->useAsBody) {
            return;
        }

        if (!is_array($input)) {
            $this->logger?->warning('Input is not array, therefore can\'t get body from input by key.');

            return;
        }

        if (!array_key_exists($this->useAsBody, $input)) {
            $this->logger?->warning(
                'Input key ' . $this->useAsBody . ' that should be used as request body isn\'t present in input.',
            );

            return;
        }

        $this->inputBody = $input[$this->useAsBody];
    }

    /**
     * Remembers headers from the input for the request: first the full headers array behind the key defined
     * via useInputKeyAsHeaders(), then the single headers mapped via useInputKeyAsHeader(). Logs warnings
     * when the input is no array or keys are missing.
     */
    protected function getHeadersFromArrayInput(mixed $input): void
    {
        if ($this->useAsHeaders) {
            if (!is_array($input)) {
                $this->logger?->warning('Input is not array, therefore can\'t get headers from input by key.');
            } elseif (!array_key_exists($this->useAsHeaders, $input)) {
                $this->logger?->warning(
                    'Input key ' . $this->useAsHeaders . ' that should be used as request headers isn\'t present in ' .
                    'input.',
                );
            } else {
                $this->inputHeaders = $input[$this->useAsHeaders];
            }
        }

        if (!is_array($this->useAsHeader)) {
            return;
        }

        if (!is_array($input)) {
            $this->logger?->warning('Input is not array, therefore can\'t get header from input by key.');

            return;
        }

        foreach ($this->useAsHeader as $inputKey => $headerName) {
            $this->addToInputHeadersFromInput($input, $inputKey, $headerName);
        }
    }

    /**
     * Adds the input value behind $inputKey to the headers taken from the input, as header $headerName.
     * When that header is already present, the value is added to it.
     */
    protected function addToInputHeadersFromInput(mixed $input, string $inputKey, string $headerName): void
    {
        $this->inputHeaders ??= [];

        if (!array_key_exists($inputKey, $input)) {
            $this->logger?->warning(
                'Input key ' . $inputKey . ' that should be used as a request header, isn\'t present in input.',
            );

            return;
        }

        $value = $input[$inputKey];

        if (array_key_exists($headerName, $this->inputHeaders)) {
            $normalized = HttpHeaders::normalize($this->inputHeaders);

            $this->inputHeaders = HttpHeaders::addTo($normalized, $headerName, $value);

            return;
        }

        $this->inputHeaders[$headerName] = is_array($value) ? $value : [$value];
    }

    /**
     * Merges the headers defined for the step with the headers taken from the input (if any).
     *
     * @return array<string, string[]>
     */
    protected function mergeHeaders(): array
    {
        $stepHeaders = HttpHeaders::normalize($this->headers);

        if (!is_array($this->inputHeaders)) {
            return $stepHeaders;
        }

        return HttpHeaders::merge($stepHeaders, HttpHeaders::normalize($this->inputHeaders));
    }

    /**
     * Forgets body and headers that were taken from the input.
     */
    protected function resetInputRequestParams(): void
    {
        $this->inputBody = null;

        $this->inputHeaders = null;
    }

    /**
     * Resolves the static URL as a template string, using the full original input data when it is an
     * array, otherwise an empty array.
     */
    private function resolveStaticUrl(): string
    {
        $inputData = $this->getFullOriginalInput()?->get();

        return TemplateString::resolve($this->staticUrl ?? '', is_array($inputData) ? $inputData : []);
    }

    /**
     * Resolves template variables in a string body and in the header values, using the full original input
     * data. Body and headers are returned untouched when there is no original input, or its data is no array.
     *
     * @param array<string, string[]> $headers
     * @return array{ 0: string|StreamInterface|null, 1: array<string, string[]> }
     */
    private function resolveVarsInRequestProperties(StreamInterface|string|null $body, array $headers): array
    {
        $inputData = $this->getFullOriginalInput()?->get();

        if (!is_array($inputData)) {
            return [$body, $headers];
        }

        if (is_string($body)) {
            $body = TemplateString::resolve($body, $inputData);
        }

        return [$body, $this->resolveVarsInHeaders($headers, $inputData)];
    }

    /**
     * Resolves template variables in every single header value.
     *
     * @param array<string, string[]> $headers
     * @param mixed[] $fullInputData
     * @return array<string, string[]>
     */
    private function resolveVarsInHeaders(array $headers, array $fullInputData): array
    {
        foreach ($headers as $name => $values) {
            // array_map() preserves the keys of the values array.
            $headers[$name] = array_map(
                static fn($value) => TemplateString::resolve($value, $fullInputData),
                $values,
            );
        }

        return $headers;
    }
}


================================================
FILE: src/Steps/Loading/HttpCrawl.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading;

use Closure;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom\HtmlElement;
use Crwlr\Crawler\Steps\Dom\XmlDocument;
use Crwlr\Crawler\Steps\Html\GetLink;
use Crwlr\Crawler\Steps\Loading\Http\Document;
use Crwlr\Crawler\Steps\Sitemap\GetUrlsFromSitemap;
use Crwlr\Utils\PhpVersion;
use Crwlr\Url\Url;
use Exception;
use Generator;
use Psr\Http\Message\UriInterface;
use Throwable;

/**
 * Loading step that crawls a website, starting from the input URL (or a sitemap) and following the links it finds.
 *
 * Discovered URLs can be restricted by host or domain, path prefix, path regex and a custom filter closure. The
 * crawl proceeds round by round (one round per link depth) until there are no new URLs, the configured depth is
 * exceeded or the step's max outputs limit is reached.
 */
class HttpCrawl extends Http
{
    /**
     * Maximum link depth to crawl; null means no depth limit.
     */
    protected ?int $depth = null;

    /**
     * Only follow links on the same host as the input URL (default). Mutually exclusive with $sameDomain.
     */
    protected bool $sameHost = true;

    /**
     * Host of the input URL, set in setHostOrDomain() when $sameHost is active.
     */
    protected string $host = '';

    /**
     * Only follow links on the same domain as the input URL. Mutually exclusive with $sameHost.
     */
    protected bool $sameDomain = false;

    /**
     * Domain of the input URL, set in setHostOrDomain() when $sameDomain is active.
     */
    protected string $domain = '';

    /**
     * If set, the path of a URL must start with this string.
     */
    protected ?string $pathStartsWith = null;

    /**
     * If set, the path of a URL must match this regular expression (used with preg_match()).
     */
    protected ?string $pathRegex = null;

    /**
     * Optional user defined filter, called with the Url and (if available) the link element.
     */
    protected ?Closure $customClosure = null;

    /**
     * Treat the initial response as an XML sitemap instead of an HTML document.
     */
    protected bool $inputIsSitemap = false;

    /**
     * Load all URLs on the same host/domain, but yield only those matching the other criteria.
     */
    protected bool $loadAll = false;

    /**
     * Keep the fragment part of discovered URLs instead of removing it.
     */
    protected bool $keepUrlFragment = false;

    /**
     * Take canonical link URLs of documents into account to avoid yielding the same page twice.
     */
    protected bool $useCanonicalLinks = false;

    /**
     * URLs to load in the next round, keyed by URL. The 'yield' flag tells if the response shall be yielded.
     *
     * @var array<string,array<string,bool>>
     */
    protected array $urls = [];

    /**
     * All URLs (requested, redirect and optionally canonical URLs) that have already been loaded.
     *
     * @var array<string,true>
     */
    protected array $loadedUrls = [];

    /**
     * Number of responses yielded so far; compared against the max outputs limit.
     */
    protected int $yieldedResponseCount = 0;

    /**
     * Forwards the headers and HTTP version to the parent Http step.
     */
    public function __construct(array $headers = [], string $httpVersion = '1.1')
    {
        parent::__construct(headers: $headers, httpVersion: $httpVersion);
    }

    /**
     * Limit the crawl to the given link depth.
     */
    public function depth(int $depth): static
    {
        $this->depth = $depth;

        return $this;
    }

    /**
     * Only follow links on the same host as the input URL. Turns off the same domain restriction.
     */
    public function sameHost(): static
    {
        $this->sameHost = true;

        $this->sameDomain = false;

        return $this;
    }

    /**
     * Only follow links on the same domain as the input URL. Turns off the same host restriction.
     */
    public function sameDomain(): static
    {
        $this->sameDomain = true;

        $this->sameHost = false;

        return $this;
    }

    /**
     * Only yield URLs where the path starts with the given string.
     */
    public function pathStartsWith(string $startsWith = ''): static
    {
        $this->pathStartsWith = $startsWith;

        return $this;
    }

    /**
     * Only yield URLs where the path matches the given regular expression (including delimiters).
     */
    public function pathMatches(string $regexPattern = ''): static
    {
        $this->pathRegex = $regexPattern;

        return $this;
    }

    /**
     * Define a custom filter. The closure is bound to this step and called with the Url and the link element
     * (null when the URL does not come from an HTML link). It has to return a bool.
     */
    public function customFilter(Closure $closure): static
    {
        $this->customClosure = $closure;

        return $this;
    }

    /**
     * Tell the step that the input URL points to an XML sitemap.
     */
    public function inputIsSitemap(): static
    {
        $this->inputIsSitemap = true;

        return $this;
    }

    /**
     * Load all found URLs on the same host/domain, but yield only responses matching the other criteria.
     */
    public function loadAllButYieldOnlyMatching(): static
    {
        $this->loadAll = true;

        return $this;
    }

    /**
     * Don't remove the fragment part from discovered URLs.
     */
    public function keepUrlFragment(): static
    {
        $this->keepUrlFragment = true;

        return $this;
    }

    /**
     * Use canonical link URLs to detect pages that have already been loaded.
     */
    public function useCanonicalLinks(): static
    {
        $this->useCanonicalLinks = true;

        return $this;
    }

    protected function validateAndSanitizeInput(mixed $input): mixed
    {
        return $this->validateAndSanitizeToUriInterface($input);
    }

    /**
     * Loads the input URL and then, round by round, all the URLs discovered in the loaded documents.
     *
     * @param UriInterface $input
     * @throws Exception
     */
    protected function invoke(mixed $input): Generator
    {
        $this->setHostOrDomain($input);

        $response = $this->getResponseFromInputUri($input);

        if (!$response) {
            return;
        }

        // Pass the logger, like loadUrls() does for all further documents.
        $initialResponseDocument = new Document($response, $this->logger);

        $this->setResponseCanonicalUrl($response, $initialResponseDocument);

        $this->addLoadedUrlsFromResponse($response);

        // A sitemap itself is never yielded, only the pages it lists.
        if (!$this->inputIsSitemap && $this->matchesAllCriteria(Url::parse($input))) {
            $this->yieldedResponseCount++;

            yield $response;
        }

        $this->urls = $this->getUrlsFromInitialResponse($response, $initialResponseDocument);

        $depth = 1;

        while (
            !$this->depthIsExceeded($depth) &&
            !empty($this->urls) &&
            (!$this->maxOutputs || $this->yieldedResponseCount < $this->maxOutputs)
        ) {
            yield from $this->loadUrls();

            $depth++;
        }
    }

    /**
     * Remembers the host or the domain of the input URL, depending on the configured restriction.
     *
     * @throws Exception when the same domain restriction is active and the URL has no domain
     */
    protected function setHostOrDomain(UriInterface $uri): void
    {
        if ($this->sameHost) {
            $this->host = $uri->getHost();
        } else {
            $domain = Url::parse($uri)->domain();

            if (!is_string($domain) || empty($domain)) {
                throw new Exception('No domain in input url');
            }

            $this->domain = $domain;
        }
    }

    /**
     * Loads all URLs of the current round, yields the matching responses and collects the URLs for the next round.
     *
     * @throws Exception
     */
    protected function loadUrls(): Generator
    {
        $newUrls = [];

        foreach ($this->urls as $url => $urlInfo) {
            $uri = Url::parsePsr7($url);

            $response = $this->getResponseFromInputUri($uri);

            if ($response !== null && !$this->wasAlreadyLoaded($response)) {
                $document = new Document($response, $this->logger);

                $this->setResponseCanonicalUrl($response, $document);

                // Has to be decided before the URLs of this response are added to the loaded URLs.
                $shouldYield = $this->yieldResponse($document, $urlInfo['yield']);

                $this->addLoadedUrlsFromResponse($response);

                $newUrls = array_merge($newUrls, $this->getUrlsFromHtmlDocument($document));

                if ($shouldYield) {
                    // Count before yielding (same as in invoke()), so the counter is also correct when the
                    // consumer stops iterating right after receiving this response.
                    $this->yieldedResponseCount++;

                    yield $response;

                    if ($this->maxOutputs && $this->yieldedResponseCount >= $this->maxOutputs) {
                        break;
                    }
                }
            }
        }

        $this->urls = $newUrls;
    }

    /**
     * Gets the URLs to crawl from the initial response, which is either a sitemap or an HTML document.
     *
     * @return array<string,array<string,bool>>
     * @throws Exception
     */
    protected function getUrlsFromInitialResponse(RespondedRequest $respondedRequest, ?Document $document = null): array
    {
        if ($this->inputIsSitemap) {
            return $this->getUrlsFromSitemap($respondedRequest);
        }

        $document = $document ?? new Document($respondedRequest, $this->logger);

        return $this->getUrlsFromHtmlDocument($document);
    }

    /**
     * Extracts the URLs from the loc elements of an XML sitemap.
     *
     * URLs that can't be parsed are logged and skipped, the same way as unresolvable links in HTML documents.
     *
     * @return array<string,array<string,bool>>
     * @throws Exception
     */
    protected function getUrlsFromSitemap(RespondedRequest $respondedRequest): array
    {
        $document = new XmlDocument(Http::getBodyString($respondedRequest));

        if (PhpVersion::isBelow(8, 4)) {
            $document = GetUrlsFromSitemap::fixUrlSetTag($document);
        }

        $urls = [];

        foreach ($document->querySelectorAll('urlset url loc') as $locElement) {
            $locText = $locElement->text();

            try {
                $url = $this->handleUrlFragment(Url::parse($locText));
            } catch (Throwable) {
                // One malformed entry must not abort the whole crawl.
                $this->logger?->warning('Failed to parse a URL from the sitemap: ' . $locText);

                continue;
            }

            if (!$this->isOnSameHostOrDomain($url)) {
                continue;
            }

            $matchesCriteria = $this->matchesCriteriaBesidesHostOrDomain($url);

            if (!$matchesCriteria && !$this->loadAll) {
                continue;
            }

            $url = $url->toString();

            if (!isset($urls[$url]) && !isset($this->urls[$url]) && !isset($this->loadedUrls[$url])) {
                $urls[$url] = ['yield' => $matchesCriteria];
            }
        }

        return $urls;
    }

    /**
     * Extracts the URLs to crawl from all the links (a elements) of an HTML document.
     *
     * @return array<string,array<string,bool>>
     * @throws Exception
     */
    protected function getUrlsFromHtmlDocument(Document $document): array
    {
        $this->addCanonicalUrlToLoadedUrls($document);

        $urls = [];

        foreach ($document->dom()->querySelectorAll('a') as $link) {
            if (GetLink::isSpecialNonHttpLink($link)) {
                continue;
            }

            try {
                $url = $this->handleUrlFragment($document->baseUrl()->resolve($link->getAttribute('href') ?? ''));
            } catch (Throwable) {
                $this->logger?->warning('Failed to resolve a link with href: ' . $link->getAttribute('href'));

                continue;
            }

            if (!$this->isOnSameHostOrDomain($url)) {
                continue;
            }

            $matchesCriteria = $this->matchesCriteriaBesidesHostOrDomain($url, $link);

            if (!$matchesCriteria && !$this->loadAll) {
                continue;
            }

            $url = $url->toString();

            // Skip URLs that are already queued (in this or the current round) or have already been loaded.
            if (!isset($urls[$url]) && !isset($this->urls[$url]) && !isset($this->loadedUrls[$url])) {
                $urls[$url] = ['yield' => $matchesCriteria];
            }
        }

        return $urls;
    }

    /**
     * Marks the requested URL and all redirect URLs of the response as loaded.
     */
    protected function addLoadedUrlsFromResponse(RespondedRequest $respondedRequest): void
    {
        $this->loadedUrls[$respondedRequest->requestedUri()] = true;

        foreach ($respondedRequest->redirects() as $redirectUrl) {
            $this->loadedUrls[$redirectUrl] = true;
        }
    }

    /**
     * If the loaded response had a redirect, it can be that it was a redirect to a page that was already loaded before.
     * In that case, don't yield that response again.
     *
     * @param RespondedRequest $respondedRequest
     * @return bool
     */
    protected function wasAlreadyLoaded(RespondedRequest $respondedRequest): bool
    {
        if (
            array_key_exists($respondedRequest->requestedUri(), $this->loadedUrls) ||
            array_key_exists($respondedRequest->effectiveUri(), $this->loadedUrls)
        ) {
            $this->logger?->info('Was already loaded before. Do not process this page again.');

            return true;
        }

        foreach ($respondedRequest->redirects() as $url) {
            if (array_key_exists($url, $this->loadedUrls)) {
                $this->logger?->info('Was already loaded before. Do not process this page again.');

                return true;
            }
        }

        return false;
    }

    /**
     * When canonical links are used, marks the canonical URL of the document as loaded.
     */
    protected function addCanonicalUrlToLoadedUrls(Document $document): void
    {
        if ($this->useCanonicalLinks && !isset($this->loadedUrls[$document->canonicalUrl()])) {
            $this->loadedUrls[$document->canonicalUrl()] = true;
        }
    }

    /**
     * Yield response only if the URL matches the defined criteria and if the canonical URL isn't already among the
     * loaded URLs (and of course, the user decided that canonical links shall be used, because this is optional).
     */
    protected function yieldResponse(Document $document, bool $urlMatchesCriteria): bool
    {
        if (!$urlMatchesCriteria) {
            return false;
        }

        return !$this->useCanonicalLinks || !array_key_exists($document->canonicalUrl(), $this->loadedUrls);
    }

    /**
     * When canonical links are used and the canonical URL differs from the effective URI of the response, the
     * canonical URL is added to the response as a redirect URI.
     *
     * @throws Exception
     */
    protected function setResponseCanonicalUrl(RespondedRequest $respondedRequest, Document $document): void
    {
        if ($this->useCanonicalLinks && $respondedRequest->effectiveUri() !== $document->canonicalUrl()) {
            $this->logger?->info('Canonical link URL of this document is: ' . $document->canonicalUrl());

            $respondedRequest->addRedirectUri($document->canonicalUrl());
        }
    }

    /**
     * Tells if the given depth is beyond the configured depth limit. Without a limit, this is never the case.
     */
    protected function depthIsExceeded(int $depth): bool
    {
        return $this->depth !== null && $depth > $this->depth;
    }

    /**
     * @throws Exception
     */
    protected function matchesAllCriteria(Url $url, ?HtmlElement $linkElement = null): bool
    {
        return $this->isOnSameHostOrDomain($url) && $this->matchesCriteriaBesidesHostOrDomain($url, $linkElement);
    }

    /**
     * @throws Exception
     */
    protected function matchesCriteriaBesidesHostOrDomain(Url $url, ?HtmlElement $linkElement = null): bool
    {
        return $this->matchesPathCriteria($url) &&
            $this->matchesCustomCriteria($url, $linkElement);
    }

    /**
     * Compares the host or the domain of the URL (depending on the configured restriction) with the one of the
     * input URL.
     *
     * @throws Exception
     */
    protected function isOnSameHostOrDomain(Url $url): bool
    {
        if ($this->sameHost) {
            return $this->host === $url->host();
        }

        return $this->domain === $url->domain();
    }

    /**
     * Checks the path of the URL against the path prefix and the path regex, if they are defined.
     *
     * @throws Exception
     */
    protected function matchesPathCriteria(Url $url): bool
    {
        if ($this->pathStartsWith === null && $this->pathRegex === null) {
            return true;
        }

        $path = $url->path() ?? '';

        return ($this->pathStartsWith === null || str_starts_with($path, $this->pathStartsWith)) &&
            ($this->pathRegex === null || preg_match($this->pathRegex, $path) === 1);
    }

    /**
     * Without a custom filter closure every URL matches; otherwise the closure (bound to this step) decides.
     */
    protected function matchesCustomCriteria(Url $url, ?HtmlElement $linkElement): bool
    {
        return $this->customClosure === null || $this->customClosure->call($this, $url, $linkElement);
    }

    /**
     * Removes the fragment from the URL, unless the user chose to keep URL fragments.
     *
     * @throws Exception
     */
    protected function handleUrlFragment(Url $url): Url
    {
        if (!$this->keepUrlFragment) {
            $url->fragment('');
        }

        return $url;
    }
}


================================================
FILE: src/Steps/Loading/LoadingStep.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Loading;

use Crwlr\Crawler\Loader\LoaderInterface;

/**
 * Gives a step access to a loader: the default one set via setLoader() and an optional custom one set via
 * withLoader(), which takes precedence.
 *
 * @template T of LoaderInterface
 */
trait LoadingStep
{
    /**
     * The default loader.
     *
     * @var T $loader
     */
    private LoaderInterface $loader;

    /**
     * A loader that, when set, is used instead of the default loader.
     *
     * @var ?T $customLoader
     */
    private ?LoaderInterface $customLoader = null;

    /**
     * Sets the default loader.
     *
     * @param T $loader
     */
    public function setLoader(LoaderInterface $loader): static
    {
        $this->loader = $loader;

        return $this;
    }

    /**
     * Sets a custom loader that is preferred over the default loader.
     *
     * @param T $loader
     */
    public function withLoader(LoaderInterface $loader): static
    {
        $this->customLoader = $loader;

        return $this;
    }

    /**
     * Returns the custom loader if there is one, otherwise the default loader.
     *
     * @return T
     */
    protected function getLoader(): LoaderInterface
    {
        if ($this->customLoader !== null) {
            return $this->customLoader;
        }

        return $this->loader;
    }
}


================================================
FILE: src/Steps/Refiners/AbstractRefiner.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners;

use Psr\Log\LoggerInterface;

/**
 * Base class for refiners, providing an optional logger and a helper to warn about unsupported value types.
 */
abstract class AbstractRefiner implements RefinerInterface
{
    /**
     * Stays null until a logger is provided via addLogger().
     */
    protected ?LoggerInterface $logger = null;

    /**
     * Sets the logger used for warnings.
     */
    public function addLogger(LoggerInterface $logger): static
    {
        $this->logger = $logger;

        return $this;
    }

    /**
     * Logs a warning (if a logger is available) that the refiner can't handle the type of the given value.
     *
     * @param string $staticRefinerMethod name of the refiner, as shown in the log message
     */
    protected function logTypeWarning(string $staticRefinerMethod, mixed $value): void
    {
        if ($this->logger === null) {
            return;
        }

        $message = 'Refiner ' . $staticRefinerMethod . ' can\'t be applied to value of type ' . gettype($value);

        $this->logger->warning($message);
    }
}


================================================
FILE: src/Steps/Refiners/DateTime/DateTimeFormat.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\DateTime;

use Crwlr\Crawler\Steps\Refiners\String\AbstractStringRefiner;
use DateTime;

/**
 * Refiner that reformats date/time strings to a target format.
 *
 * When an origin format is provided, values are parsed with it. Otherwise, the refiner tries to parse the value
 * without knowing the format, using strtotime(). Values that can't be parsed are returned unchanged and a warning
 * is logged.
 */
class DateTimeFormat extends AbstractStringRefiner
{
    /**
     * @param string $targetFormat format (as understood by DateTime::format()) of the refined value
     * @param string|null $originFormat format (as understood by DateTime::createFromFormat()) of the input value
     */
    public function __construct(protected string $targetFormat, protected ?string $originFormat = null) {}

    public function refine(mixed $value): mixed
    {
        return $this->apply($value, function ($value) {
            if ($this->originFormat) {
                $parsed = DateTime::createFromFormat($this->originFormat, $value);
            } else {
                $parsed = $this->parseFromUnknownFormat($value);
            }

            if ($parsed === null) {
                // parseFromUnknownFormat() already logged a warning.
                return $value;
            } elseif ($parsed === false) {
                $this->logger?->warning(
                    'Failed parsing date/time "' . $value . '", so can\'t reformat it to requested format.',
                );

                return $value;
            }

            return $parsed->format($this->targetFormat);
        }, 'DateTimeRefiner::reformat()');
    }

    /**
     * Tries to parse a date/time string of unknown format with strtotime().
     *
     * @return DateTime|null null (after logging a warning) when the value can't be parsed
     */
    private function parseFromUnknownFormat(string $value): ?DateTime
    {
        $timestamp = strtotime($value);

        // strtotime() signals failure only with false. A timestamp of 0 is the valid Unix epoch
        // (1970-01-01 00:00:00 UTC) and must not be treated as a parsing failure.
        if ($timestamp === false) {
            $this->logger?->warning(
                'Failed to automatically (without known format) parse date/time "' . $value . '", so can\'t reformat ' .
                'it to requested format.',
            );

            return null;
        }

        return (new DateTime())->setTimestamp($timestamp);
    }
}


================================================
FILE: src/Steps/Refiners/DateTimeRefiner.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners;

use Crwlr\Crawler\Steps\Refiners\DateTime\DateTimeFormat;

/**
 * Static factory for date/time refiners.
 */
class DateTimeRefiner
{
    /**
     * Creates a refiner that reformats date/time strings to the target format. Without an origin format, the
     * refiner tries to parse values without knowing their format.
     */
    public static function reformat(string $targetFormat, ?string $originFormat = null): DateTimeFormat
    {
        return new DateTimeFormat(targetFormat: $targetFormat, originFormat: $originFormat);
    }
}


================================================
FILE: src/Steps/Refiners/Html/RemoveFromHtml.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\Html;

use Crwlr\Crawler\Steps\Dom;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Html\CssSelector;
use Crwlr\Crawler\Steps\Html\DomQuery;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Crawler\Steps\Refiners\String\AbstractStringRefiner;
use Throwable;

/**
 * Refiner that removes all nodes matching a CSS selector or XPath query from an HTML string.
 */
class RemoveFromHtml extends AbstractStringRefiner
{
    protected DomQuery $selector;

    /**
     * True as long as the warning about an empty selector still has to be logged.
     *
     * The warning can't be logged in the constructor, because a logger can only be added (via addLogger()) after
     * the instance was created.
     */
    private bool $warnAboutEmptySelector = false;

    /**
     * @param string|DomQuery $selector a string is treated as a CSS selector
     * @throws InvalidDomQueryException
     */
    public function __construct(string|DomQuery $selector)
    {
        $selectorString = is_string($selector) ? $selector : $selector->query;

        $this->warnAboutEmptySelector = trim($selectorString) === '';

        if (is_string($selector)) {
            $selector = Dom::cssSelector($selector);
        }

        $this->selector = $selector;
    }

    /**
     * Removes the matching nodes from the HTML in $value (a string or an array containing strings).
     *
     * If the value contains an html tag, the HTML of the whole document is returned, otherwise only the inner HTML
     * of the body. Values that can't be parsed as HTML are returned unchanged.
     */
    public function refine(mixed $value): mixed
    {
        $this->logEmptySelectorWarning();

        return $this->apply($value, function ($value) {
            try {
                $document = new HtmlDocument($value);
            } catch (Throwable $exception) {
                $this->logger?->warning(
                    'Failed parsing output as HTML in refiner to remove nodes from HTML: ' . $exception->getMessage(),
                );

                return $value;
            }

            if ($this->selector instanceof CssSelector) {
                $document->removeNodesMatchingSelector($this->selector->query);
            } else {
                $document->removeNodesMatchingXPath($this->selector->query);
            }

            // HTML tag names are case-insensitive, so also detect e.g. <Html>.
            if (stripos($value, '<html') !== false) {
                return $document->outerHtml();
            }

            return $document->querySelector('body')?->innerHtml() ?? $document->outerHtml();
        }, 'HtmlRefiner::remove()');
    }

    /**
     * Logs the warning about an empty selector once, as soon as a logger is available.
     */
    private function logEmptySelectorWarning(): void
    {
        if (!$this->warnAboutEmptySelector || $this->logger === null) {
            return;
        }

        $this->warnAboutEmptySelector = false;

        $this->logger->warning(
            'Empty selector in remove HTML refiner. If you want HTML nodes to be removed, please define a ' .
            'selector for those nodes.',
        );
    }
}


================================================
FILE: src/Steps/Refiners/HtmlRefiner.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners;

use Crwlr\Crawler\Steps\Html\DomQuery;
use Crwlr\Crawler\Steps\Refiners\Html\RemoveFromHtml;

/**
 * Static factory for HTML refiners.
 */
class HtmlRefiner
{
    /**
     * Creates a refiner that removes the nodes matching the selector from HTML.
     */
    public static function remove(string|DomQuery $selector): RemoveFromHtml
    {
        return new RemoveFromHtml(selector: $selector);
    }
}


================================================
FILE: src/Steps/Refiners/RefinerInterface.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners;

use Psr\Log\LoggerInterface;

/**
 * Contract for refiners: objects that take a value and return a refined version of it.
 */
interface RefinerInterface
{
    /**
     * Returns the refined version of the given value.
     */
    public function refine(mixed $value): mixed;

    /**
     * Provides the refiner with a logger.
     */
    public function addLogger(LoggerInterface $logger): static;
}


================================================
FILE: src/Steps/Refiners/String/AbstractStringRefiner.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\String;

use Closure;
use Crwlr\Crawler\Steps\Refiners\AbstractRefiner;

/**
 * Base class for refiners operating on strings, or on the string elements of an array.
 */
abstract class AbstractStringRefiner extends AbstractRefiner
{
    /**
     * Applies the refiner closure to a string value, or to every string element of an array value.
     *
     * Non-string array elements are kept as they are. For values that are neither string nor array, a type warning
     * is logged and the value is returned unchanged.
     *
     * @param Closure $refiner receives one string and returns the refined value
     * @param string $staticRefinerMethod name of the refiner, used in the type warning
     * @return mixed
     */
    protected function apply(mixed $value, Closure $refiner, string $staticRefinerMethod): mixed
    {
        if (is_string($value)) {
            return $refiner($value);
        }

        if (is_array($value)) {
            // array_map() with a single array preserves the keys.
            return array_map(
                fn($element) => is_string($element) ? $refiner($element) : $element,
                $value,
            );
        }

        $this->logTypeWarning($staticRefinerMethod, $value);

        return $value;
    }
}


================================================
FILE: src/Steps/Refiners/String/StrAfterFirst.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\String;

/**
 * Refiner returning the (trimmed) part of a string after the first occurrence of a search string.
 */
class StrAfterFirst extends AbstractStringRefiner
{
    public function __construct(protected readonly string $first) {}

    /**
     * With an empty search string the value is returned unchanged. When the search string does not occur in the
     * value, the whole (trimmed) value is returned.
     */
    public function refine(mixed $value): mixed
    {
        $afterFirst = function (string $subject): string {
            if ($this->first === '') {
                return $subject;
            }

            $parts = explode($this->first, $subject, 2);

            // Without an occurrence there is only one part: the whole subject.
            return trim($parts[1] ?? $parts[0]);
        };

        return $this->apply($value, $afterFirst, 'StringRefiner::afterFirst()');
    }
}


================================================
FILE: src/Steps/Refiners/String/StrAfterLast.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\String;

/**
 * Refiner returning the (trimmed) part of a string after the last occurrence of a search string.
 */
class StrAfterLast extends AbstractStringRefiner
{
    public function __construct(protected readonly string $last) {}

    /**
     * With an empty search string the result is an empty string. When the search string does not occur in the
     * value, the whole (trimmed) value is returned.
     */
    public function refine(mixed $value): mixed
    {
        $afterLast = function (string $subject): string {
            if ($this->last === '') {
                return '';
            }

            $parts = explode($this->last, $subject);

            return trim($parts[array_key_last($parts)]);
        };

        return $this->apply($value, $afterLast, 'StringRefiner::afterLast()');
    }
}


================================================
FILE: src/Steps/Refiners/String/StrBeforeFirst.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\String;

/**
 * Refiner returning the (trimmed) part of a string before the first occurrence of a search string.
 */
class StrBeforeFirst extends AbstractStringRefiner
{
    public function __construct(protected readonly string $first) {}

    /**
     * With an empty search string the result is an empty string. When the search string does not occur in the
     * value, the whole (trimmed) value is returned.
     */
    public function refine(mixed $value): mixed
    {
        $beforeFirst = function (string $subject): string {
            if ($this->first === '') {
                return '';
            }

            // Only the part before the first occurrence is needed, so splitting once is enough.
            [$head] = explode($this->first, $subject, 2);

            return trim($head);
        };

        return $this->apply($value, $beforeFirst, 'StringRefiner::beforeFirst()');
    }
}


================================================
FILE: src/Steps/Refiners/String/StrBeforeLast.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\String;

/**
 * Refiner returning the (trimmed) part of a string before the last occurrence of a search string.
 */
class StrBeforeLast extends AbstractStringRefiner
{
    public function __construct(protected readonly string $last) {}

    /**
     * With an empty search string, or when the search string does not occur in the value, the value is returned
     * unchanged (not trimmed).
     */
    public function refine(mixed $value): mixed
    {
        $beforeLast = function (string $subject): string {
            if ($this->last === '') {
                return $subject;
            }

            $parts = explode($this->last, $subject);

            if (count($parts) < 2) {
                return $subject;
            }

            // Drop the part after the last occurrence and glue the rest back together.
            $leadingParts = array_slice($parts, 0, -1);

            return trim(implode($this->last, $leadingParts));
        };

        return $this->apply($value, $beforeLast, 'StringRefiner::beforeLast()');
    }
}


================================================
FILE: src/Steps/Refiners/String/StrBetweenFirst.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\String;

/**
 * Refiner returning the (trimmed) part of a string between the first occurrence of a start string and the next
 * occurrence of an end string.
 */
class StrBetweenFirst extends AbstractStringRefiner
{
    public function __construct(protected readonly string $start, protected readonly string $end) {}

    /**
     * An empty start string means: from the beginning of the value. An empty end string means: until the end of the
     * value. When the start string does not occur in the value, the result is an empty string.
     */
    public function refine(mixed $value): mixed
    {
        $betweenFirst = function (string $subject): string {
            $afterStart = $subject;

            if ($this->start !== '') {
                $parts = explode($this->start, $subject, 2);

                if (count($parts) < 2) {
                    return '';
                }

                $afterStart = $parts[1];
            }

            if ($this->end === '') {
                return trim($afterStart);
            }

            return trim(explode($this->end, $afterStart)[0]);
        };

        return $this->apply($value, $betweenFirst, 'StringRefiner::betweenFirst()');
    }
}


================================================
FILE: src/Steps/Refiners/String/StrBetweenLast.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\String;

/**
 * Refiner returning the (trimmed) part of a string between the last occurrence of a start string and the next
 * occurrence of an end string.
 */
class StrBetweenLast extends AbstractStringRefiner
{
    public function __construct(protected readonly string $start, protected readonly string $end) {}

    /**
     * An empty start string means: from the beginning of the value. An empty end string means: until the end of the
     * value. When the start string does not occur in the value, the search for the end string starts at the
     * beginning of the value.
     */
    public function refine(mixed $value): mixed
    {
        $betweenLast = function (string $subject): string {
            $afterLastStart = $subject;

            if ($this->start !== '') {
                $parts = explode($this->start, $subject);

                $afterLastStart = $parts[array_key_last($parts)];
            }

            if ($this->end === '') {
                return trim($afterLastStart);
            }

            return trim(explode($this->end, $afterLastStart)[0]);
        };

        return $this->apply($value, $betweenLast, 'StringRefiner::betweenLast()');
    }
}


================================================
FILE: src/Steps/Refiners/String/StrReplace.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\String;

/**
 * Refiner replacing occurrences of the search string(s) with the replacement string(s), using str_replace().
 */
class StrReplace extends AbstractStringRefiner
{
    /**
     * @param string|string[] $search
     * @param string|string[] $replace
     */
    public function __construct(
        protected readonly string|array $search,
        protected readonly string|array $replace,
    ) {}

    /**
     * Replaces in a string value, or in every string element of an array value, and trims the result.
     */
    public function refine(mixed $value): mixed
    {
        return $this->apply($value, function ($value) {
            $replaced = str_replace($this->search, $this->replace, $value);

            return trim($replaced);
        }, 'StringRefiner::replace()');
    }
}


================================================
FILE: src/Steps/Refiners/StringRefiner.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners;

use Crwlr\Crawler\Steps\Refiners\String\StrAfterFirst;
use Crwlr\Crawler\Steps\Refiners\String\StrAfterLast;
use Crwlr\Crawler\Steps\Refiners\String\StrBeforeFirst;
use Crwlr\Crawler\Steps\Refiners\String\StrBeforeLast;
use Crwlr\Crawler\Steps\Refiners\String\StrBetweenFirst;
use Crwlr\Crawler\Steps\Refiners\String\StrBetweenLast;
use Crwlr\Crawler\Steps\Refiners\String\StrReplace;

/**
 * Static factory for all the string refiners.
 */
class StringRefiner
{
    /**
     * Get the part after the first occurrence of $first.
     */
    public static function afterFirst(string $first): StrAfterFirst
    {
        return new StrAfterFirst(first: $first);
    }

    /**
     * Get the part after the last occurrence of $last.
     */
    public static function afterLast(string $last): StrAfterLast
    {
        return new StrAfterLast(last: $last);
    }

    /**
     * Get the part before the first occurrence of $first.
     */
    public static function beforeFirst(string $first): StrBeforeFirst
    {
        return new StrBeforeFirst(first: $first);
    }

    /**
     * Get the part before the last occurrence of $last.
     */
    public static function beforeLast(string $last): StrBeforeLast
    {
        return new StrBeforeLast(last: $last);
    }

    /**
     * Get the part between the first occurrence of $start and the next occurrence of $end.
     */
    public static function betweenFirst(string $start, string $end): StrBetweenFirst
    {
        return new StrBetweenFirst(start: $start, end: $end);
    }

    /**
     * Get the part between the last occurrence of $start and the next occurrence of $end.
     */
    public static function betweenLast(string $start, string $end): StrBetweenLast
    {
        return new StrBetweenLast(start: $start, end: $end);
    }

    /**
     * Replace occurrences of $search with $replace.
     *
     * @param string|string[] $search
     * @param string|string[] $replace
     */
    public static function replace(string|array $search, string|array $replace): StrReplace
    {
        return new StrReplace(search: $search, replace: $replace);
    }
}


================================================
FILE: src/Steps/Refiners/Url/AbstractUrlRefiner.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\Url;

use Crwlr\Crawler\Steps\Refiners\AbstractRefiner;
use Crwlr\Url\Exceptions\InvalidUrlComponentException;
use Crwlr\Url\Url;
use Exception;
use Psr\Http\Message\UriInterface;

/**
 * Base class for refiners that change one component of a URL.
 */
abstract class AbstractUrlRefiner extends AbstractRefiner
{
    /**
     * Refines a URL given as string, Url or UriInterface instance. Arrays are refined element by element
     * (recursively, keys are preserved). For values of other types a type warning is logged and the value is
     * returned unchanged.
     *
     * @throws InvalidUrlComponentException|Exception
     */
    public function refine(mixed $value): mixed
    {
        if (is_array($value)) {
            return array_map($this->refine(...), $value);
        }

        $isUrlLike = is_string($value) || $value instanceof Url || $value instanceof UriInterface;

        if (!$isUrlLike) {
            $this->logTypeWarning($this->staticRefinerMethod(), $value);

            return $value;
        }

        $url = $value instanceof Url ? $value : Url::parse($value);

        return $this->refineUrl($url);
    }

    /**
     * The name of the refiner, as shown in type warning log messages.
     */
    abstract protected function staticRefinerMethod(): string;

    /**
     * Applies the actual change to the URL and returns the result as a string.
     */
    abstract protected function refineUrl(Url $url): string;
}


================================================
FILE: src/Steps/Refiners/Url/WithFragment.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\Url;

use Crwlr\Url\Exceptions\InvalidUrlComponentException;
use Crwlr\Url\Url;
use Exception;

/**
 * URL refiner that sets the fragment of a URL.
 */
class WithFragment extends AbstractUrlRefiner
{
    public function __construct(protected readonly string $fragment) {}

    /**
     * Sets the configured fragment on the URL and returns the URL as a string.
     *
     * @throws InvalidUrlComponentException|Exception
     */
    protected function refineUrl(Url $url): string
    {
        $url->fragment($this->fragment);

        return (string) $url;
    }

    /**
     * Name of this refiner for log messages.
     */
    protected function staticRefinerMethod(): string
    {
        return 'UrlRefiner::withFragment()';
    }
}


================================================
FILE: src/Steps/Refiners/Url/WithHost.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\Url;

use Crwlr\Url\Exceptions\InvalidUrlComponentException;
use Crwlr\Url\Url;
use Exception;

/**
 * URL refiner that sets the host of a URL.
 */
class WithHost extends AbstractUrlRefiner
{
    public function __construct(protected readonly string $host) {}

    /**
     * Sets the configured host on the URL and returns the URL as a string.
     *
     * @throws InvalidUrlComponentException|Exception
     */
    protected function refineUrl(Url $url): string
    {
        $url->host($this->host);

        return (string) $url;
    }

    /**
     * Name of this refiner for log messages.
     */
    protected function staticRefinerMethod(): string
    {
        return 'UrlRefiner::withHost()';
    }
}


================================================
FILE: src/Steps/Refiners/Url/WithPath.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\Url;

use Crwlr\Url\Exceptions\InvalidUrlComponentException;
use Crwlr\Url\Url;
use Exception;

/**
 * URL refiner that sets the path of a URL.
 */
class WithPath extends AbstractUrlRefiner
{
    public function __construct(protected readonly string $path) {}

    /**
     * Sets the configured path on the URL and returns the URL as a string.
     *
     * @throws InvalidUrlComponentException|Exception
     */
    protected function refineUrl(Url $url): string
    {
        $url->path($this->path);

        return (string) $url;
    }

    /**
     * Name of this refiner for log messages.
     */
    protected function staticRefinerMethod(): string
    {
        return 'UrlRefiner::withPath()';
    }
}


================================================
FILE: src/Steps/Refiners/Url/WithPort.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\Url;

use Crwlr\Url\Exceptions\InvalidUrlComponentException;
use Crwlr\Url\Url;
use Exception;

/**
 * URL refiner that sets the port of a URL.
 */
class WithPort extends AbstractUrlRefiner
{
    public function __construct(protected readonly int $port) {}

    /**
     * Sets the configured port on the URL and returns the URL as a string.
     *
     * @throws InvalidUrlComponentException|Exception
     */
    protected function refineUrl(Url $url): string
    {
        $url->port($this->port);

        return (string) $url;
    }

    /**
     * Name of this refiner for log messages.
     */
    protected function staticRefinerMethod(): string
    {
        return 'UrlRefiner::withPort()';
    }
}


================================================
FILE: src/Steps/Refiners/Url/WithQuery.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\Url;

use Crwlr\Url\Exceptions\InvalidUrlComponentException;
use Crwlr\Url\Url;
use Exception;

/**
 * URL refiner working on the query component of a URL.
 *
 * Instances are created by UrlRefiner::withQuery() and, with an empty string as query,
 * by UrlRefiner::withoutQuery().
 */
class WithQuery extends AbstractUrlRefiner
{
    /**
     * @param string $query  Value that refineUrl() hands to the query() method of the Url object.
     */
    public function __construct(protected readonly string $query) {}

    /**
     * Returns the name of the static UrlRefiner factory method that creates this refiner.
     *
     * NOTE(review): presumably consumed by AbstractUrlRefiner (e.g. for messages) - confirm there.
     */
    protected function staticRefinerMethod(): string
    {
        return 'UrlRefiner::withQuery()';
    }

    /**
     * Passes the configured query to the Url object and returns the URL cast to a string.
     *
     * @throws InvalidUrlComponentException|Exception
     */
    protected function refineUrl(Url $url): string
    {
        $url->query($this->query);

        return (string) $url;
    }
}


================================================
FILE: src/Steps/Refiners/Url/WithScheme.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\Url;

use Crwlr\Url\Exceptions\InvalidUrlComponentException;
use Crwlr\Url\Url;
use Exception;

/**
 * URL refiner working on the scheme component of a URL.
 *
 * Instances are created by UrlRefiner::withScheme().
 */
class WithScheme extends AbstractUrlRefiner
{
    /**
     * @param string $scheme  Value that refineUrl() hands to the scheme() method of the Url object.
     */
    public function __construct(protected readonly string $scheme) {}

    /**
     * Returns the name of the static UrlRefiner factory method that creates this refiner.
     *
     * NOTE(review): presumably consumed by AbstractUrlRefiner (e.g. for messages) - confirm there.
     */
    protected function staticRefinerMethod(): string
    {
        return 'UrlRefiner::withScheme()';
    }

    /**
     * Passes the configured scheme to the Url object and returns the URL cast to a string.
     *
     * @throws InvalidUrlComponentException|Exception
     */
    protected function refineUrl(Url $url): string
    {
        $url->scheme($this->scheme);

        return (string) $url;
    }
}


================================================
FILE: src/Steps/Refiners/Url/WithoutPort.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners\Url;

use Crwlr\Url\Exceptions\InvalidUrlComponentException;
use Crwlr\Url\Url;
use Exception;

/**
 * URL refiner that resets the port component of a URL.
 *
 * Takes no configuration. Instances are created by UrlRefiner::withoutPort().
 */
class WithoutPort extends AbstractUrlRefiner
{
    /**
     * Returns the name of the static UrlRefiner factory method that creates this refiner.
     *
     * NOTE(review): presumably consumed by AbstractUrlRefiner (e.g. for messages) - confirm there.
     */
    protected function staticRefinerMethod(): string
    {
        return 'UrlRefiner::withoutPort()';
    }

    /**
     * Calls resetPort() on the Url object and returns the URL cast to a string.
     *
     * @throws InvalidUrlComponentException|Exception
     */
    protected function refineUrl(Url $url): string
    {
        $url->resetPort();

        return (string) $url;
    }
}


================================================
FILE: src/Steps/Refiners/UrlRefiner.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Refiners;

use Crwlr\Crawler\Steps\Refiners\Url\WithFragment;
use Crwlr\Crawler\Steps\Refiners\Url\WithHost;
use Crwlr\Crawler\Steps\Refiners\Url\WithoutPort;
use Crwlr\Crawler\Steps\Refiners\Url\WithPath;
use Crwlr\Crawler\Steps\Refiners\Url\WithPort;
use Crwlr\Crawler\Steps\Refiners\Url\WithQuery;
use Crwlr\Crawler\Steps\Refiners\Url\WithScheme;

/**
 * Static factory for the URL refiner classes in the Refiners\Url namespace.
 *
 * Every method only instantiates the matching refiner class. The "without" variants for query and
 * fragment have no class of their own, they reuse WithQuery/WithFragment with an empty string.
 */
class UrlRefiner
{
    /**
     * @param string $scheme  Passed on unchanged to the WithScheme constructor.
     */
    public static function withScheme(string $scheme): WithScheme
    {
        return new WithScheme($scheme);
    }

    /**
     * @param string $host  Passed on unchanged to the WithHost constructor.
     */
    public static function withHost(string $host): WithHost
    {
        return new WithHost($host);
    }

    /**
     * @param int $port  Passed on unchanged to the WithPort constructor.
     */
    public static function withPort(int $port): WithPort
    {
        return new WithPort($port);
    }

    /**
     * Creates the refiner that calls resetPort() on the URL.
     */
    public static function withoutPort(): WithoutPort
    {
        return new WithoutPort();
    }

    /**
     * @param string $path  Passed on unchanged to the WithPath constructor.
     */
    public static function withPath(string $path): WithPath
    {
        return new WithPath($path);
    }

    /**
     * @param string $query  Passed on unchanged to the WithQuery constructor.
     */
    public static function withQuery(string $query): WithQuery
    {
        return new WithQuery($query);
    }

    /**
     * Same as withQuery(''): a WithQuery refiner configured with an empty query string.
     */
    public static function withoutQuery(): WithQuery
    {
        return new WithQuery('');
    }

    /**
     * @param string $fragment  Passed on unchanged to the WithFragment constructor.
     */
    public static function withFragment(string $fragment): WithFragment
    {
        return new WithFragment($fragment);
    }

    /**
     * Same as withFragment(''): a WithFragment refiner configured with an empty fragment.
     */
    public static function withoutFragment(): WithFragment
    {
        return new WithFragment('');
    }
}


================================================
FILE: src/Steps/Sitemap/GetUrlsFromSitemap.php
================================================
<?php

namespace Crwlr\Crawler\Steps\Sitemap;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Steps\Dom\XmlDocument;
use Crwlr\Crawler\Steps\Dom\XmlElement;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Steps\StepOutputType;
use Crwlr\Utils\PhpVersion;
use Generator;

/**
 * Step that yields the URLs (<loc> elements) of all <url> entries in a sitemap's <urlset>.
 *
 * By default, it yields the plain URL strings. After calling withData() it yields an array per URL,
 * containing the URL plus the optional lastmod, changefreq and priority values.
 */
class GetUrlsFromSitemap extends Step
{
    protected bool $withData = false;

    /**
     * Remove attributes from a sitemap's <urlset> tag
     *
     * Symfony's DomCrawler component has problems when a sitemap's <urlset> tag contains certain attributes.
     * So, if the count of urls in the sitemap is zero, try to remove all attributes from the <urlset> tag.
     *
     * @return XmlDocument  The given document when it already contains urls, otherwise a new document built
     *                      from the XML with a bare <urlset> opening tag.
     */
    public static function fixUrlSetTag(XmlDocument $dom): XmlDocument
    {
        if ($dom->querySelectorAll('urlset url')->count() !== 0) {
            return $dom;
        }

        $xml = $dom->outerXml();

        // [^>]* stays inside the opening tag: unlike a lazy dot it also matches a tag whose attributes are
        // spread over multiple lines, and it can't swallow a following tag when <urlset> has no attributes.
        // preg_replace() returns null on error, in that case fall back to the unchanged XML.
        return new XmlDocument(preg_replace('/<urlset\b[^>]*>/', '<urlset>', $xml) ?? $xml);
    }

    /**
     * Make the step yield arrays with url, lastmod, changefreq and priority instead of only URL strings.
     */
    public function withData(): static
    {
        $this->withData = true;

        return $this;
    }

    public function outputType(): StepOutputType
    {
        return $this->withData ? StepOutputType::AssociativeArrayOrObject : StepOutputType::Scalar;
    }

    /**
     * @param XmlDocument $input
     */
    protected function invoke(mixed $input): Generator
    {
        if (PhpVersion::isBelow(8, 4)) {
            $input = self::fixUrlSetTag($input);
        }

        foreach ($input->querySelectorAll('urlset url') as $urlNode) {
            // Query the <loc> element only once per <url> entry; entries without it are skipped.
            $locNode = $urlNode->querySelector('loc');

            if (!$locNode) {
                continue;
            }

            yield $this->withData ? $this->getWithAdditionalData($urlNode) : $locNode->text();
        }
    }

    /**
     * @throws MissingZlibExtensionException
     */
    protected function validateAndSanitizeInput(mixed $input): mixed
    {
        return $this->validateAndSanitizeToXmlDocumentInstance($input);
    }

    /**
     * Builds the output array for one <url> entry.
     *
     * The key url is always set (empty string when there is no <loc> element), lastmod, changefreq and
     * priority only when the corresponding child element exists.
     *
     * @return string[]
     */
    protected function getWithAdditionalData(XmlElement $urlNode): array
    {
        $data = ['url' => $urlNode->querySelector('loc')?->text() ?? ''];

        $properties = ['lastmod', 'changefreq', 'priority'];

        foreach ($properties as $property) {
            $node = $urlNode->querySelector($property);

            if ($node) {
                $data[$property] = $node->text();
            }
        }

        return $data;
    }
}


================================================
FILE: src/Steps/Sitemap.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

use Crwlr\Crawler\Steps\Loading\GetSitemapsFromRobotsTxt;
use Crwlr\Crawler\Steps\Sitemap\GetUrlsFromSitemap;

/**
 * Static factory for the sitemap related steps.
 */
class Sitemap
{
    /**
     * Creates a new GetSitemapsFromRobotsTxt step instance.
     */
    public static function getSitemapsFromRobotsTxt(): GetSitemapsFromRobotsTxt
    {
        return new GetSitemapsFromRobotsTxt();
    }

    /**
     * Creates a new GetUrlsFromSitemap step instance.
     */
    public static function getUrlsFromSitemap(): GetUrlsFromSitemap
    {
        return new GetUrlsFromSitemap();
    }
}


================================================
FILE: src/Steps/Step.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

use Closure;
use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Output;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Dom\XmlDocument;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Url\Exceptions\InvalidUrlException;
use Crwlr\Url\Url;
use Exception;
use Generator;
use InvalidArgumentException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;

/**
 * Base class for steps that receive one input value and yield any number of outputs.
 *
 * Child classes implement invoke() and can override validateAndSanitizeInput(). invokeStep() is the entry
 * point: it validates the input, calls invoke() and wraps everything that invoke() yields in Output objects,
 * after applying refiners, filters, the output key and the uniqueness and max outputs limits.
 */
abstract class Step extends BaseStep
{
    protected ?Closure $updateInputUsingOutput = null;

    protected bool $excludeFromGroupOutput = false;

    private bool $groupOutputsPerInput = false;

    /**
     * The actual work of the step. Receives the value returned by validateAndSanitizeInput().
     *
     * @return Generator<mixed>
     */
    abstract protected function invoke(mixed $input): Generator;

    /**
     * Calls the validateAndSanitizeInput method and assures that the invoke method receives valid, sanitized input.
     *
     * Yields nothing when the max outputs limit is already exceeded, when there is no input value to use, when
     * the input is invalid (the InvalidArgumentException is logged, not rethrown), or when unique inputs are
     * enabled and this input was already seen.
     *
     * @return Generator<Output>
     * @throws Exception
     */
    final public function invokeStep(Input $input): Generator
    {
        if ($this->maxOutputsExceeded()) {
            return;
        }

        $this->storeOriginalInput($input);

        $inputForStepInvocation = $this->getInputKeyToUse($input);

        if ($inputForStepInvocation) {
            try {
                $validInputValue = $this->validateAndSanitizeInput($inputForStepInvocation->get());
            } catch (InvalidArgumentException $exception) {
                $this->logInvalidInputException($exception, $inputForStepInvocation->get());

                return;
            }

            if ($this->uniqueInput === false || $this->inputOrOutputIsUnique(new Input($validInputValue))) {
                if (!$this->groupOutputsPerInput) {
                    yield from $this->invokeAndYield($validInputValue, $input);
                } else {
                    yield from $this->invokeAndYieldOneOutputPerInput($validInputValue, $input);
                }
            }
        }
    }

    /**
     * Callback that is called in a step group to adapt the input for further steps
     *
     * In groups all the steps are called with the same Input, but with this callback it's possible to adjust the input
     * for the following steps. The closure is bound to this step and called with the current input value and the
     * output value, its return value becomes the new input value (see callUpdateInputUsingOutput()).
     */
    public function updateInputUsingOutput(Closure $closure): static
    {
        $this->updateInputUsingOutput = $closure;

        return $this;
    }

    /**
     * Sets the flag that is returned by shouldOutputBeExcludedFromGroupOutput().
     */
    public function excludeFromGroupOutput(): static
    {
        $this->excludeFromGroupOutput = true;

        return $this;
    }

    /**
     * Collect all outputs that one input produces in an array and yield them as one single output.
     */
    public function oneOutputPerInput(): static
    {
        $this->groupOutputsPerInput = true;

        return $this;
    }

    public function shouldOutputBeExcludedFromGroupOutput(): bool
    {
        return $this->excludeFromGroupOutput;
    }

    /**
     * If the user set a callback to update the input (see above) => call it.
     *
     * @return Input  A copy of the input with the value returned by the callback, or the unchanged input when
     *                no callback is set.
     */
    public function callUpdateInputUsingOutput(Input $input, Output $output): Input
    {
        if ($this->updateInputUsingOutput instanceof Closure) {
            return $input->withValue(
                $this->updateInputUsingOutput->call($this, $input->get(), $output->get()),
            );
        }

        return $input;
    }

    /**
     * Validate and sanitize the incoming input value
     *
     * In child classes you can add this method to validate and sanitize the incoming input. The method is called
     * automatically when the step is invoked within the Crawler. It receives the plain input value (not the Input
     * object) and whatever it returns is passed on, as is, to the invoke method.
     *
     * @throws InvalidArgumentException  Throw this if the input value is invalid for this step.
     */
    protected function validateAndSanitizeInput(mixed $input): mixed
    {
        return $input;
    }

    /**
     * Returns the input as string, without a leading UTF-8 byte order mark.
     *
     * Accepts a string, an object with a __toString() method, or an array containing exactly one such element.
     *
     * @throws InvalidArgumentException
     */
    protected function validateAndSanitizeStringOrStringable(
        mixed $inputValue,
        string $exceptionMessage = 'Input must be string or stringable',
    ): string {
        $inputValue = $this->getSingleElementFromArray($inputValue);

        if (is_object($inputValue) && method_exists($inputValue, '__toString')) {
            return $this->removeUtf8BomFromString($inputValue->__toString());
        }

        if (is_string($inputValue)) {
            return $this->removeUtf8BomFromString($inputValue);
        }

        throw new InvalidArgumentException($exceptionMessage);
    }

    /**
     * Returns the body of an HTTP response, or the string(able) input, without a leading UTF-8 byte order mark.
     *
     * When the input is an array with more than one element and a key named response, that element is used.
     *
     * @param bool $allowOnlyRespondedRequest  When true, a plain ResponseInterface instance is not accepted as
     *                                         HTTP response, only a RespondedRequest.
     * @throws InvalidArgumentException|MissingZlibExtensionException
     */
    protected function validateAndSanitizeStringOrHttpResponse(
        mixed $inputValue,
        string $exceptionMessage = 'Input must be string, stringable or HTTP response (RespondedRequest)',
        bool $allowOnlyRespondedRequest = false,
    ): string {
        if (is_array($inputValue) && count($inputValue) > 1 && array_key_exists('response', $inputValue)) {
            $inputValue = $inputValue['response'];
        }

        $inputValue = $this->getSingleElementFromArray($inputValue);

        if (
            $inputValue instanceof RespondedRequest ||
            ($inputValue instanceof ResponseInterface && !$allowOnlyRespondedRequest)
        ) {
            return $this->removeUtf8BomFromString(Http::getBodyString($inputValue));
        }

        return $this->validateAndSanitizeStringOrStringable($inputValue, $exceptionMessage);
    }

    /**
     * Returns the input as UriInterface instance.
     *
     * UriInterface instances are returned unchanged, strings and stringable objects are parsed. When parsing
     * fails, the thrown InvalidArgumentException carries the original InvalidUrlException as previous exception.
     *
     * @throws InvalidArgumentException
     */
    protected function validateAndSanitizeToUriInterface(
        mixed $inputValue,
        string $exceptionMessage = 'Input must be string, stringable or an instance of UriInterface or Crwlr\\Url',
    ): UriInterface {
        $inputValue = $this->getSingleElementFromArray($inputValue);

        if ($inputValue instanceof UriInterface) {
            return $inputValue;
        }

        if (
            is_string($inputValue) ||
            $inputValue instanceof Url ||
            (is_object($inputValue) && method_exists($inputValue, '__toString'))
        ) {
            try {
                return Url::parsePsr7((string) $inputValue);
            } catch (InvalidUrlException $exception) {
                // Keep the original exception in the chain, so its stack trace isn't lost.
                throw new InvalidArgumentException($exception->getMessage(), previous: $exception);
            }
        }

        throw new InvalidArgumentException($exceptionMessage);
    }

    /**
     * @throws InvalidArgumentException|MissingZlibExtensionException
     */
    protected function validateAndSanitizeToHtmlDocumentInstance(
        mixed $inputValue,
        string $exceptionMessage = 'Input must be string, stringable or HTTP response (RespondedRequest)',
    ): HtmlDocument {
        return new HtmlDocument($this->validateAndSanitizeStringOrHttpResponse($inputValue, $exceptionMessage));
    }

    /**
     * @throws InvalidArgumentException|MissingZlibExtensionException
     */
    protected function validateAndSanitizeToXmlDocumentInstance(
        mixed $inputValue,
        string $exceptionMessage = 'Input must be string, stringable or HTTP response (RespondedRequest)',
    ): XmlDocument {
        return new XmlDocument($this->validateAndSanitizeStringOrHttpResponse($inputValue, $exceptionMessage));
    }

    /**
     * If the value is an array with exactly one element, return that element, otherwise the unchanged value.
     */
    protected function getSingleElementFromArray(mixed $inputValue): mixed
    {
        if (is_array($inputValue) && count($inputValue) === 1) {
            return reset($inputValue);
        }

        return $inputValue;
    }

    /**
     * Default way of invoking: yields one Output object per value that invoke() yields.
     *
     * @throws Exception
     */
    private function invokeAndYield(mixed $validInputValue, Input $input): Generator
    {
        foreach ($this->invoke($validInputValue) as $outputData) {
            $outputData = $this->applyRefiners($outputData, $input->get());

            if ($this->maxOutputsExceeded()) {
                break;
            } elseif (!$this->passesAllFilters($outputData)) {
                continue;
            }

            // Array outputs are not wrapped, the output key is only applied to non-array values.
            if (!is_array($outputData) && $this->outputKey) {
                $outputData = [$this->outputKey => $outputData];
            }

            $output = $this->makeOutput($outputData, $input);

            if ($this->uniqueOutput && !$this->inputOrOutputIsUnique($output)) {
                continue;
            }

            yield $output;

            $this->trackYieldedOutput();
        }
    }

    /**
     * Version of invokeAndYield() when oneOutputPerInput() was called.
     *
     * Collects all (refined and filtered) values that invoke() yields in an array and yields that array as
     * one single Output.
     */
    private function invokeAndYieldOneOutputPerInput(mixed $validInputValue, Input $input): Generator
    {
        $outputDataArray = [];

        foreach ($this->invoke($validInputValue) as $outputData) {
            $outputData = $this->applyRefiners($outputData, $input->get());

            if (!$this->passesAllFilters($outputData)) {
                continue;
            }

            $outputDataArray[] = $outputData;
        }

        if ($this->outputKey) {
            $outputDataArray = [$this->outputKey => $outputDataArray];
        }

        $output = $this->makeOutput($outputDataArray, $input);

        if ($this->uniqueOutput && !$this->inputOrOutputIsUnique($output)) {
            return;
        }

        yield $output;

        $this->trackYieldedOutput();
    }

    /**
     * Sometimes there can be a so-called byte order mark character as first characters in a text file. See:
     * https://stackoverflow.com/questions/53303571/why-does-the-filereader-stream-read-239-187-191-from-a-textfile
     * 239, 187, 191 (0xEF 0xBB 0xBF) is the BOM for UTF-8. Remove it, as it is unnecessary and can cause issues
     * when a string needs to start with a certain character.
     */
    private function removeUtf8BomFromString(string $string): string
    {
        if (str_starts_with($string, "\xEF\xBB\xBF")) {
            return substr($string, 3);
        }

        return $string;
    }

    /**
     * Logs an error (if a logger is set) about input that the step can't work with.
     *
     * For the default "Input must be string..." messages, the type of the invalid input is appended.
     */
    private function logInvalidInputException(InvalidArgumentException $exception, mixed $input): void
    {
        $exceptionMessage = $exception->getMessage();

        $stepClassName = $this->getStepClassName();

        $logMessage = ($stepClassName ? 'The ' . $stepClassName . ' step' : 'A step') . ' was called with input ' .
            'that it can not work with: ' . $exceptionMessage;

        if (str_starts_with($exceptionMessage, 'Input must be string')) {
            $logMessage .= '. The invalid input is of type ' . gettype($input) . '.';
        }

        $this->logger?->error($logMessage);
    }
}


================================================
FILE: src/Steps/StepInterface.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

use Crwlr\Crawler\Input;
use Crwlr\Crawler\Output;
use Crwlr\Crawler\Steps\Filters\FilterInterface;
use Generator;
use Psr\Log\LoggerInterface;

/**
 * Contract for steps: a step is invoked with an Input object and yields Output objects.
 *
 * Besides invokeStep(), keepsAnything*() and resetAfterRun(), all methods are fluent configuration
 * methods returning the step instance itself.
 */
interface StepInterface
{
    /**
     * Hands a PSR-3 logger to the step.
     */
    public function addLogger(LoggerInterface $logger): static;

    /**
     * @param Input $input
     * @return Generator<Output>
     */
    public function invokeStep(Input $input): Generator;

    /**
     * @param string|string[]|null $keys
     */
    public function keep(string|array|null $keys = null): static;

    public function keepAs(string $key): static;

    /**
     * @param string|string[]|null $keys
     */
    public function keepFromInput(string|array|null $keys = null): static;

    public function keepInputAs(string $key): static;

    public function keepsAnything(): bool;

    public function keepsAnythingFromInputData(): bool;

    public function keepsAnythingFromOutputData(): bool;

    public function useInputKey(string $key): static;

    public function uniqueInputs(?string $key = null): static;

    public function uniqueOutputs(?string $key = null): static;

    /**
     * @param string|FilterInterface $keyOrFilter  Either a filter, or a key together with a filter as second
     *                                             argument.
     */
    public function where(string|FilterInterface $keyOrFilter, ?FilterInterface $filter = null): static;

    /**
     * @param string|FilterInterface $keyOrFilter  Either a filter, or a key together with a filter as second
     *                                             argument.
     */
    public function orWhere(string|FilterInterface $keyOrFilter, ?FilterInterface $filter = null): static;

    public function outputKey(string $key): static;

    public function maxOutputs(int $maxOutputs): static;

    public function resetAfterRun(): void;
}


================================================
FILE: src/Steps/StepOutputType.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

/**
 * The kinds of output a step can declare via its outputType() method.
 */
enum StepOutputType
{
    case Scalar;

    case AssociativeArrayOrObject;

    // NOTE(review): presumably for steps that can yield both kinds of output - confirm against the usages.
    case Mixed;
}


================================================
FILE: src/Steps/Xml.php
================================================
<?php

namespace Crwlr\Crawler\Steps;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom\XmlDocument;
use Crwlr\Crawler\Steps\Html\CssSelector;
use Crwlr\Crawler\Steps\Html\DomQuery;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;

/**
 * Dom step variant working on XML documents.
 */
class Xml extends Dom
{
    /**
     * Queries given as plain strings are turned into CssSelector instances.
     *
     * @throws InvalidDomQueryException
     */
    public function makeDefaultDomQueryInstance(string $query): DomQuery
    {
        return new CssSelector($query);
    }

    /**
     * Converts the input to an XmlDocument instance.
     *
     * When the input is a RespondedRequest, its effective URI is remembered as base URL first.
     *
     * @param mixed $input
     * @return XmlDocument
     * @throws MissingZlibExtensionException
     */
    protected function validateAndSanitizeInput(mixed $input): XmlDocument
    {
        if ($input instanceof RespondedRequest) {
            $this->baseUrl = $input->effectiveUri();
        }

        return $this->validateAndSanitizeToXmlDocumentInstance($input);
    }
}


================================================
FILE: src/Stores/JsonFileStore.php
================================================
<?php

namespace Crwlr\Crawler\Stores;

use Crwlr\Crawler\Result;
use Exception;

/**
 * Store that writes all results as one JSON array to a file.
 *
 * The file is created in the constructor, named after the optional prefix and the creation timestamp.
 * On every call to store() the whole file is read, the result is appended and the file is written again.
 */
class JsonFileStore extends Store
{
    protected int $createTimestamp;

    /**
     * @param string $storePath  Directory in which the file is created.
     * @param string|null $filePrefix  Optional prefix for the file name, separated by a dash from the timestamp.
     */
    public function __construct(protected readonly string $storePath, protected readonly ?string $filePrefix = null)
    {
        $this->createTimestamp = time();

        touch($this->filePath());

        file_put_contents($this->filePath(), '[]');
    }

    /**
     * Appends the result to the JSON array in the store file.
     *
     * @throws Exception  When the results can't be encoded as JSON. The file is left untouched in that case.
     */
    public function store(Result $result): void
    {
        $currentResultsFileContent = file_get_contents($this->filePath());

        if (!$currentResultsFileContent) {
            $currentResultsFileContent = '[]';
        }

        $results = json_decode($currentResultsFileContent, true);

        // Unreadable or unexpected file content (json_decode() returns null or a scalar): start a new list
        // instead of failing with an Error when appending.
        if (!is_array($results)) {
            $results = [];
        }

        $results[] = $result->toArray();

        $encodedResults = json_encode($results);

        // json_encode() returns false on failure (e.g. malformed UTF-8 in scraped data). Writing that to the
        // file would replace all previously stored results with an empty file.
        if ($encodedResults === false) {
            throw new Exception('Failed to encode results as JSON: ' . json_last_error_msg());
        }

        file_put_contents($this->filePath(), $encodedResults);
    }

    /**
     * Path of the store file: {storePath}/{filePrefix-}{createTimestamp}.json
     */
    public function filePath(): string
    {
        return $this->storePath . '/' .
            ($this->filePrefix ? $this->filePrefix . '-' : '') . $this->createTimestamp . '.json';
    }
}


================================================
FILE: src/Stores/SimpleCsvFileStore.php
================================================
<?php

namespace Crwlr\Crawler\Stores;

use Crwlr\Crawler\Result;
use Exception;

/**
 * Store that appends every result as one line to a CSV file.
 *
 * The keys of the first stored result are written as header line. Array values are joined to a single
 * string with " | " as separator.
 */
class SimpleCsvFileStore extends Store
{
    protected int $createTimestamp;

    protected bool $isFirstResult = true;

    /**
     * @param string $storePath  Directory in which the file is created.
     * @param string|null $filePrefix  Optional prefix for the file name, separated by a dash from the timestamp.
     */
    public function __construct(protected readonly string $storePath, protected readonly ?string $filePrefix = null)
    {
        $this->createTimestamp = time();

        touch($this->filePath());
    }

    /**
     * Appends the result as a line to the CSV file, preceded by the header line if it is the first result.
     *
     * @throws Exception  When the file can't be opened.
     */
    public function store(Result $result): void
    {
        $handle = fopen($this->filePath(), 'a');

        if (!is_resource($handle)) {
            throw new Exception('Failed to open file to store data');
        }

        $row = $result->toArray();

        if ($this->isFirstResult) {
            fputcsv($handle, array_keys($row), escape: '');

            $this->isFirstResult = false;
        }

        if ($this->anyPropertyIsArray($result)) {
            $row = $this->flattenResultArray($row);
        }

        fputcsv($handle, array_values($row), escape: '');

        fclose($handle);
    }

    /**
     * Path of the store file: {storePath}/{filePrefix-}{createTimestamp}.csv
     */
    public function filePath(): string
    {
        $prefix = $this->filePrefix ? $this->filePrefix . '-' : '';

        return $this->storePath . '/' . $prefix . $this->createTimestamp . '.csv';
    }

    /**
     * Tells if at least one of the result's top level values is an array.
     */
    protected function anyPropertyIsArray(Result $result): bool
    {
        return array_filter($result->toArray(), is_array(...)) !== [];
    }

    /**
     * Replaces all top level array values with a string of their elements, joined with " | ".
     *
     * @param mixed[] $result
     * @return array<string|int>
     */
    protected function flattenResultArray(array $result): array
    {
        return array_map(
            static fn(mixed $value): mixed => is_array($value) ? implode(' | ', $value) : $value,
            $result,
        );
    }
}


================================================
FILE: src/Stores/Store.php
================================================
<?php

namespace Crwlr\Crawler\Stores;

use Psr\Log\LoggerInterface;

/**
 * Base class for stores, providing the logger handling required by the StoreInterface.
 *
 * The store() method has to be implemented by the child classes.
 */
abstract class Store implements StoreInterface
{
    // Stays null until addLogger() is called.
    protected ?LoggerInterface $logger = null;

    /**
     * Sets the logger and returns the store instance.
     */
    public function addLogger(LoggerInterface $logger): static
    {
        $this->logger = $logger;

        return $this;
    }
}


================================================
FILE: src/Stores/StoreInterface.php
================================================
<?php

namespace Crwlr\Crawler\Stores;

use Crwlr\Crawler\Result;
use Psr\Log\LoggerInterface;

/**
 * Contract for stores: receives Result objects one by one and can be handed a PSR-3 logger.
 */
interface StoreInterface
{
    /**
     * Store a single result.
     */
    public function store(Result $result): void;

    /**
     * Hands a PSR-3 logger to the store.
     */
    public function addLogger(LoggerInterface $logger): static;
}


================================================
FILE: src/UserAgents/BotUserAgent.php
================================================
<?php

namespace Crwlr\Crawler\UserAgents;

/**
 * User agent of a bot, rendered as: Mozilla/5.0 (compatible; {productToken}/{version}; +{infoUri})
 *
 * The version and info uri parts are left out when they are not set (or empty).
 */
class BotUserAgent implements BotUserAgentInterface
{
    /**
     * @param string $productToken  The name of the Crawler/Bot
     * @param string|null $infoUri  Uri where site owners can find information about your crawler.
     * @param string|null $version  In case you want to communicate infos about different versions of your crawler.
     */
    public function __construct(
        protected string $productToken,
        protected ?string $infoUri = null,
        protected ?string $version = null,
    ) {}

    /**
     * Static shortcut for the constructor, takes the same arguments in the same order.
     */
    public static function make(string $productToken, ?string $crawlerInfoUri = null, ?string $version = null): self
    {
        return new self(productToken: $productToken, infoUri: $crawlerInfoUri, version: $version);
    }

    public function __toString(): string
    {
        $versionPart = $this->version ? '/' . $this->version : '';

        $infoPart = $this->infoUri ? '; +' . $this->infoUri : '';

        return 'Mozilla/5.0 (compatible; ' . $this->productToken . $versionPart . $infoPart . ')';
    }

    public function productToken(): string
    {
        return $this->productToken;
    }
}


================================================
FILE: src/UserAgents/BotUserAgentInterface.php
================================================
<?php

namespace Crwlr\Crawler\UserAgents;

/**
 * A user agent that additionally exposes the product token (name) of the bot.
 */
interface BotUserAgentInterface extends UserAgentInterface
{
    /**
     * The product token of the bot, as a plain string.
     */
    public function productToken(): string;
}


================================================
FILE: src/UserAgents/UserAgent.php
================================================
<?php

namespace Crwlr\Crawler\UserAgents;

/**
 * User agent consisting of an arbitrary, fixed string.
 */
class UserAgent implements UserAgentInterface
{
    /**
     * @param string $userAgent  The string that __toString() returns unchanged.
     */
    public function __construct(protected readonly string $userAgent) {}

    public function __toString(): string
    {
        return $this->userAgent;
    }

    /**
     * Creates an instance with the user agent string "Mozilla/5.0 (compatible)".
     */
    public static function mozilla5CompatibleBrowser(): self
    {
        return new self('Mozilla/5.0 (compatible)');
    }
}


================================================
FILE: src/UserAgents/UserAgentInterface.php
================================================
<?php

namespace Crwlr\Crawler\UserAgents;

/**
 * Contract for user agents: anything that can be cast to the user agent string.
 */
interface UserAgentInterface
{
    /**
     * The user agent as string.
     */
    public function __toString(): string;
}


================================================
FILE: src/Utils/Gzip.php
================================================
<?php

namespace Crwlr\Crawler\Utils;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;

/**
 * Helper to gzip encode and decode strings, degrading gracefully when PHP's zlib extension is missing.
 */
class Gzip
{
    /**
     * Gzip encodes a string.
     *
     * @param bool $throwException  Throw an exception when ext-zlib is not installed, instead of returning
     *                              the string unchanged.
     * @return string  The encoded string, or the unchanged string when ext-zlib is missing or encoding fails.
     * @throws MissingZlibExtensionException
     */
    public static function encode(string $string, bool $throwException = false): string
    {
        // Without this early return, gzencode() would be called although it doesn't exist (fatal error)
        // when ext-zlib is missing and no exception is wanted.
        if (!function_exists('gzencode')) {
            if ($throwException) {
                throw new MissingZlibExtensionException('PHP ext-zlib not installed.');
            }

            return $string;
        }

        $encoded = gzencode($string);

        return $encoded !== false ? $encoded : $string;
    }

    /**
     * Decodes a gzip encoded string.
     *
     * Strings that don't start with the gzip signature are returned unchanged.
     *
     * @param bool $throwException  Throw an exception when ext-zlib is not installed (also when the string
     *                              is not gzip encoded), instead of returning the string unchanged.
     * @return string  The decoded string, or the unchanged string when it is not gzip encoded, ext-zlib is
     *                 missing or decoding fails.
     * @throws MissingZlibExtensionException
     */
    public static function decode(string $string, bool $throwException = false): string
    {
        if (!function_exists('gzdecode')) {
            if ($throwException) {
                throw new MissingZlibExtensionException('PHP ext-zlib not installed.');
            }

            return $string;
        }

        // Gzip data starts with the magic bytes 0x1f 0x8b, followed by 0x08 for the deflate compression
        // method. This is a check on raw bytes, so no multibyte string function is needed.
        if (!str_starts_with($string, "\x1f\x8b\x08")) {
            return $string;
        }

        $decoded = gzdecode($string);

        return $decoded !== false ? $decoded : $string;
    }
}


================================================
FILE: src/Utils/HttpHeaders.php
================================================
<?php

namespace Crwlr\Crawler\Utils;

/**
 * Helpers for HTTP header arrays in the form [header name => list of values].
 *
 * Header names are used as array keys as they are, so they are compared case-sensitively.
 */
final class HttpHeaders
{
    /**
     * Makes sure every header's value is an array, by wrapping single values in an array.
     *
     * @param array<string, string|string[]> $headers
     * @return array<string, string[]>
     */
    public static function normalize(array $headers): array
    {
        return array_map(
            static fn(mixed $value): array => is_array($value) ? $value : [$value],
            $headers,
        );
    }

    /**
     * Merges the second headers array into the first one.
     *
     * Headers only existing in $mergeHeaders are taken over as they are. For headers existing in both arrays,
     * the values not yet contained in $headers are appended.
     *
     * @param array<string, array<int, string>> $headers
     * @param array<string, array<int, string>> $mergeHeaders
     * @return array<string, array<int, string>>
     */
    public static function merge(array $headers, array $mergeHeaders): array
    {
        foreach ($mergeHeaders as $name => $values) {
            if (array_key_exists($name, $headers)) {
                $headers = self::addTo($headers, $name, $values);

                continue;
            }

            $headers[$name] = $values;
        }

        return $headers;
    }

    /**
     * Adds one or multiple values to a header.
     *
     * Creates the header if it doesn't exist yet. Otherwise, only values that the header doesn't contain yet
     * (strict comparison) are appended.
     *
     * @param array<string, array<int, string>> $headers
     * @param string $headerName
     * @param string|string[] $value
     * @return array<string, array<int, string>>
     */
    public static function addTo(array $headers, string $headerName, string|array $value): array
    {
        $newValues = is_array($value) ? $value : [$value];

        if (!array_key_exists($headerName, $headers)) {
            $headers[$headerName] = $newValues;

            return $headers;
        }

        foreach ($newValues as $newValue) {
            if (!in_array($newValue, $headers[$headerName], true)) {
                $headers[$headerName][] = $newValue;
            }
        }

        return $headers;
    }
}


================================================
FILE: src/Utils/OutputTypeHelper.php
================================================
<?php

namespace Crwlr\Crawler\Utils;

/**
 * Helpers to inspect and convert step output values.
 */
class OutputTypeHelper
{
    /**
     * Converts an object to an array.
     *
     * Uses the first method that the object has, of: toArrayForResult(), toArray(), __serialize().
     * If it has none of them, the object is cast to an array.
     *
     * @return mixed[]
     */
    public static function objectToArray(object $output): array
    {
        foreach (['toArrayForResult', 'toArray', '__serialize'] as $conversionMethod) {
            if (method_exists($output, $conversionMethod)) {
                return $output->{$conversionMethod}();
            }
        }

        return (array) $output;
    }

    /**
     * Everything that is neither an object nor an associative array counts as scalar here,
     * so also non-associative arrays.
     */
    public static function isScalar(mixed $output): bool
    {
        return !is_object($output) && !self::isAssociativeArray($output);
    }

    public static function isAssociativeArrayOrObject(mixed $output): bool
    {
        return is_object($output) || self::isAssociativeArray($output);
    }

    /**
     * An array counts as associative when its first key is a string. Empty arrays are not associative.
     */
    public static function isAssociativeArray(mixed $output): bool
    {
        return is_array($output) && is_string(array_key_first($output));
    }

    /**
     * Converts all objects in the array to arrays, on all levels of nesting.
     *
     * @param mixed[] $data
     * @return mixed[]
     */
    public static function recursiveChildObjectsToArray(array $data): array
    {
        foreach ($data as $key => $value) {
            if (is_object($value)) {
                $value = self::objectToArray($value);
            }

            if (is_array($value)) {
                $data[$key] = self::recursiveChildObjectsToArray($value);
            }
        }

        return $data;
    }
}


================================================
FILE: src/Utils/RequestKey.php
================================================
<?php

namespace Crwlr\Crawler\Utils;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http;
use Psr\Http\Message\RequestInterface;

class RequestKey
{
    /**
     * Creates a unique key for an HTTP request
     *
     * The key will be based on all its properties: method, URI, headers, body.
     * So, for example, if requests send different bodies, but the rest is identical, the keys will be different.
     *
     * By default, Cookie headers are removed before building the key, so the key is independent of sessions.
     * You can also pass other headers (or none if you want cookies to be included) to be ignored as second argument.
     * The names of the headers to ignore are matched case-insensitively.
     *
     * @param RequestInterface|RespondedRequest $request
     * @param string[] $ignoreHeaders
     * @return string the md5 hash of the serialized request data
     * @throws MissingZlibExtensionException
     */
    public static function from(RequestInterface|RespondedRequest $request, array $ignoreHeaders = ['Cookie']): string
    {
        $request = $request instanceof RespondedRequest ? $request->request : $request;

        $data = [
            'requestMethod' => $request->getMethod(),
            'requestUri' => $request->getUri()->__toString(),
            'requestHeaders' => $request->getHeaders(),
            'requestBody' => Http::getBodyString($request),
        ];

        $data = self::removeIgnoreHeaders($data, $ignoreHeaders);

        return md5(serialize($data));
    }

    /**
     * Removes all headers that shall be ignored from the 'requestHeaders' element of the request data.
     *
     * HTTP header names are case-insensitive, so a header is removed no matter in which case it was
     * written in the request or in the list of headers to ignore.
     *
     * @param array<string, mixed> $data
     * @param string[] $ignoreHeaders
     * @return array<string, mixed>
     */
    private static function removeIgnoreHeaders(array $data, array $ignoreHeaders): array
    {
        if ($ignoreHeaders === [] || !isset($data['requestHeaders']) || !is_array($data['requestHeaders'])) {
            return $data;
        }

        // Lower-cased names as array keys, to have a cheap case-insensitive lookup below.
        $ignoredNames = array_flip(
            array_map(static fn(string $headerName): string => strtolower($headerName), $ignoreHeaders),
        );

        foreach (array_keys($data['requestHeaders']) as $headerName) {
            // Numeric header names are converted to integer array keys by PHP, hence the cast.
            if (isset($ignoredNames[strtolower((string) $headerName)])) {
                unset($data['requestHeaders'][$headerName]);
            }
        }

        return $data;
    }
}


================================================
FILE: src/Utils/TemplateString.php
================================================
<?php

namespace Crwlr\Crawler\Utils;

use Adbar\Dot;

class TemplateString
{
    /**
     * Replaces all `[crwl:...]` placeholders in a string with values from the provided data array.
     *
     * The name inside a placeholder can optionally be wrapped in single or double quotes. When the name
     * is an existing key in the data array, that element is used. Otherwise, if the name contains a dot,
     * it is looked up via Adbar\Dot::get(). Placeholders that match neither case become an empty string.
     *
     * @param mixed[] $data
     */
    public static function resolve(string $string, array $data = []): string
    {
        if (!str_contains($string, '[crwl:')) {
            return $string;
        }

        $replacePlaceholder = function ($matches) use ($data) {
            $key = self::trimAndUnescapeQuotes($matches[1]);

            if (array_key_exists($key, $data)) {
                return $data[$key];
            }

            return str_contains($key, '.') ? (new Dot($data))->get($key) : '';
        };

        $resolved = preg_replace_callback('/\[crwl:(.+?)]/m', $replacePlaceholder, $string);

        // preg_replace_callback() returns null on error, fall back to the unchanged input in that case.
        return $resolved ?? $string;
    }

    /**
     * Removes one pair of wrapping quotes (single or double) and un-escapes backslash escaped quotes.
     */
    private static function trimAndUnescapeQuotes(string $string): string
    {
        foreach (['\'', '"'] as $quote) {
            if (str_starts_with($string, $quote) && str_ends_with($string, $quote)) {
                $string = substr($string, 1, -1);

                break; // Only one pair of quotes is removed.
            }
        }

        return str_replace(["\'", '\"'], ["'", '"'], $string);
    }
}


================================================
FILE: tests/Cache/CacheItemTest.php
================================================
<?php

namespace tests\Cache;

use Crwlr\Crawler\Cache\CacheItem;
use DateInterval;
use DateTimeImmutable;

it('is serializable and unserializable without loss', function () {
    $original = new CacheItem('value', 'key123', 123, new DateTimeImmutable('2023-01-10 12:10:00'));

    // Round trip through PHP's native serialization.
    $restored = unserialize(serialize($original));

    expect($restored->value())->toBe('value')
        ->and($restored->key())->toBe('key123')
        ->and($restored->ttl)->toBe(123)
        ->and($restored->createdAt->format('Y-m-d H:i:s'))->toBe('2023-01-10 12:10:00');
});

it('creates a key based on the value if you don\'t provide a key manually', function () {
    // No key is passed to the constructor here.
    $itemWithoutExplicitKey = new CacheItem('foo');

    expect($itemWithoutExplicitKey->key())->toBeString()
        ->and(strlen($itemWithoutExplicitKey->key()))->toBeGreaterThan(0);
});

it('tells if it is expired already', function () {
    // All items have a ttl of 10 (seconds), they only differ in their creation time.
    $justCreated = new CacheItem('v', 'k', 10);

    expect($justCreated->isExpired())->toBeFalse();

    $createdNineSecondsAgo = (new DateTimeImmutable())->sub(new DateInterval('PT9S'));

    $stillValid = new CacheItem('v', 'k', 10, $createdNineSecondsAgo);

    expect($stillValid->isExpired())->toBeFalse();

    $createdElevenSecondsAgo = (new DateTimeImmutable())->sub(new DateInterval('PT11S'));

    $expired = new CacheItem('v', 'k', 10, $createdElevenSecondsAgo);

    expect($expired->isExpired())->toBeTrue();
});


================================================
FILE: tests/Cache/FileCacheTest.php
================================================
<?php

namespace tests\Cache;

use Crwlr\Crawler\Cache\CacheItem;
use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Cache\FileCache;
use Crwlr\Crawler\Steps\Loading\Http;
use DateInterval;
use DateTimeImmutable;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use GuzzleHttp\Psr7\Utils;
use PHPUnit\Framework\TestCase;
use RuntimeException;

use function tests\helper_cachedir;
use function tests\helper_resetCacheDir;

/**
 * Stores every given item in the cache, using the item's own cacheKey() as cache key.
 *
 * @param mixed[] $items
 * @throws MissingZlibExtensionException
 */
function helper_addMultipleItemsToCache(array $items, FileCache $cache): void
{
    foreach ($items as $cacheable) {
        $key = $cacheable->cacheKey();

        $cache->set($key, $cacheable);
    }
}

/**
 * Creates a RespondedRequest from a GET request to the given URL and a default (empty) response.
 */
function helper_respondedRequestWithRequestUrl(string $requestUrl): RespondedRequest
{
    $request = new Request('GET', $requestUrl);

    return new RespondedRequest($request, new Response());
}

/**
 * Reads the cache file for a key directly from the cache directory and returns the unserialized
 * CacheItem object. This is needed because FileCache::get() only returns the value wrapped in
 * the CacheItem. Returns null when the file content is not a serialized CacheItem.
 */
function helper_getCacheItemByKey(string $key): ?CacheItem
{
    $serialized = file_get_contents(helper_cachedir() . '/' . $key);

    if ($serialized === false) {
        // Serialized empty array, so unserialize() below yields something that is not a CacheItem.
        $serialized = 'a:0:{}';
    }

    $unserialized = unserialize($serialized);

    if ($unserialized instanceof CacheItem) {
        return $unserialized;
    }

    return null;
}

// Reset the cache directory after every test, so cache files written by one test can't affect another.
afterEach(function () {
    helper_resetCacheDir();
});

/** @var TestCase $this */

it('caches a simple value', function () {
    $fileCache = new FileCache(helper_cachedir());

    $fileCache->set('user', 'otsch');

    // Reading the same key must give back exactly the value that was stored.
    $cachedValue = $fileCache->get('user');

    expect($cachedValue)->toBe('otsch');
});

it('caches RespondedRequest objects', function () {
    $response = new RespondedRequest(new Request('GET', '/'), new Response());

    $key = $response->cacheKey();

    $cache = new FileCache(helper_cachedir());

    $wasStored = $cache->set($key, $response);

    // The item is expected to end up in a file named like the cache key, inside the cache directory.
    expect($wasStored)->toBeTrue()
        ->and(file_exists(helper_cachedir() . '/' . $key))->toBeTrue()
        ->and($cache->get($key))->toBeInstanceOf(RespondedRequest::class);
});

it('checks if it has an item for a certain key', function () {
    $cachedResponse = new RespondedRequest(new Request('GET', '/'), new Response());

    $knownKey = $cachedResponse->cacheKey();

    $cache = new FileCache(helper_cachedir());

    $cache->set($knownKey, $cachedResponse);

    // The stored key is reported as present, a key that was never stored is not.
    expect($cache->has($knownKey))->toBeTrue();

    expect($cache->has('otherKey'))->toBeFalse();
});

it('does not return expired items', function () {
    $response = new RespondedRequest(new Request('GET', '/'), new Response());

    // Created 11 seconds ago with a ttl of 10, so the item is already expired when it is stored.
    $createdAt = (new DateTimeImmutable())->sub(new DateInterval('PT11S'));

    $expiredItem = new CacheItem($response, $response->cacheKey(), 10, $createdAt);

    $cache = new FileCache(helper_cachedir());

    $cache->set($expiredItem->key(), $expiredItem);

    expect($cache->has($expiredItem->key()))->toBeFalse();

    expect($cache->get($expiredItem->key()))->toBeNull();
});

it('deletes a cache item', function () {
    $response = new RespondedRequest(new Request('GET', '/'), new Response());

    $key = $response->cacheKey();

    $cache = new FileCache(helper_cachedir());

    $cache->set($key, $response);

    // Make sure the item really is in the cache before deleting it.
    expect($cache->has($key))->toBeTrue();

    $cache->delete($key);

    expect($cache->has($key))->toBeFalse();
});

it('deletes an expired cache item when has() is called with its key', function () {
    $createdAt = (new DateTimeImmutable())->sub(new DateInterval('PT11S'));

    $expiredItem = new CacheItem('bar', 'foo', 10, $createdAt);

    $cache = new FileCache(helper_cachedir());

    $cache->set('foo', $expiredItem);

    $cacheFile = helper_cachedir() . '/foo';

    // The file exists until has() notices that the item is expired.
    expect(file_exists($cacheFile))->toBeTrue();

    expect($cache->has('foo'))->toBeFalse();

    expect(file_exists($cacheFile))->toBeFalse();
});

it('deletes an expired cache item when get() is called with its key', function () {
    $createdAt = (new DateTimeImmutable())->sub(new DateInterval('PT11S'));

    $expiredItem = new CacheItem('bar', 'foo', 10, $createdAt);

    $cache = new FileCache(helper_cachedir());

    $cache->set('foo', $expiredItem);

    $cacheFile = helper_cachedir() . '/foo';

    // The file exists until get() notices that the item is expired and returns the default instead.
    expect(file_exists($cacheFile))->toBeTrue();

    expect($cache->get('foo', 'defaultValue'))->toBe('defaultValue');

    expect(file_exists($cacheFile))->toBeFalse();
});

it('clears the whole cache', function () {
    $responses = [
        helper_respondedRequestWithRequestUrl('/foo'),
        helper_respondedRequestWithRequestUrl('/bar'),
        helper_respondedRequestWithRequestUrl('/baz'),
    ];

    $cache = new FileCache(helper_cachedir());

    helper_addMultipleItemsToCache($responses, $cache);

    // All three items are in the cache before clearing it...
    foreach ($responses as $response) {
        expect($cache->has($response->cacheKey()))->toBeTrue();
    }

    $cache->clear();

    // ...and none of them afterwards.
    foreach ($responses as $response) {
        expect($cache->has($response->cacheKey()))->toBeFalse();
    }
});

it('gets multiple items', function () {
    $fooResponse = helper_respondedRequestWithRequestUrl('/foo');

    $barResponse = helper_respondedRequestWithRequestUrl('/bar');

    $bazResponse = helper_respondedRequestWithRequestUrl('/baz');

    $cache = new FileCache(helper_cachedir());

    helper_addMultipleItemsToCache([$fooResponse, $barResponse, $bazResponse], $cache);

    $keys = [$fooResponse->cacheKey(), $barResponse->cacheKey(), $bazResponse->cacheKey()];

    $retrieved = $cache->getMultiple($keys);

    // The items are expected in the same order as the keys passed to getMultiple().
    expect(reset($retrieved)->request->getUri()->__toString())->toBe('/foo')
        ->and(next($retrieved)->request->getUri()->__toString())->toBe('/bar')
        ->and(next($retrieved)->request->getUri()->__toString())->toBe('/baz');
});

it('sets multiple items', function () {
    $responses = [
        helper_respondedRequestWithRequestUrl('/foo'),
        helper_respondedRequestWithRequestUrl('/bar'),
        helper_respondedRequestWithRequestUrl('/baz'),
    ];

    // setMultiple() takes the items indexed by their cache keys.
    $itemsByKey = [];

    foreach ($responses as $response) {
        $itemsByKey[$response->cacheKey()] = $response;
    }

    $cache = new FileCache(helper_cachedir());

    $cache->setMultiple($itemsByKey);

    foreach ($responses as $response) {
        expect($cache->has($response->cacheKey()))->toBeTrue();
    }
});

it('deletes multiple items', function () {
    $responses = [
        helper_respondedRequestWithRequestUrl('/blog'),
        helper_respondedRequestWithRequestUrl('/contact'),
        helper_respondedRequestWithRequestUrl('/privacy'),
    ];

    $cache = new FileCache(helper_cachedir());

    helper_addMultipleItemsToCache($responses, $cache);

    $keys = [];

    foreach ($responses as $response) {
        $keys[] = $response->cacheKey();
    }

    $cache->deleteMultiple($keys);

    // None of the deleted keys may be found in the cache anymore.
    foreach ($responses as $response) {
        expect($cache->has($response->cacheKey()))->toBeFalse();
    }
});

it('can still use legacy (pre CacheItem object) cache files', function () {
    // The fixture file contains a serialized array instead of a serialized CacheItem object.
    $legacyFileContent = file_get_contents(__DIR__ . '/_cachefilecontent');

    file_put_contents(helper_cachedir() . '/foo', $legacyFileContent);

    $cache = new FileCache(helper_cachedir());

    expect($cache->has('foo'))->toBeTrue();

    $legacyValue = $cache->get('foo');

    expect($legacyValue)->toBeArray();

    $restored = RespondedRequest::fromArray($legacyValue);

    expect($restored)->toBeInstanceOf(RespondedRequest::class);

    expect($restored->requestedUri())->toBe(
        'https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php',
    );
});

it('compresses cache data when useCompression() is used', function () {
    $body = <<<DATA
        Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et
        dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet
        clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet,
        consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,
        sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea
        takimata sanctus est Lorem ipsum dolor sit amet.
        DATA;

    $response = new RespondedRequest(
        new Request('GET', '/compression'),
        new Response(body: Utils::streamFor($body)),
    );

    $cache = new FileCache(helper_cachedir());

    // First write the item without compression and remember how big the cache file is.
    $cache->set($response->cacheKey(), $response);

    $sizeWithoutCompression = filesize(helper_cachedir() . '/' . $response->cacheKey());

    expect($sizeWithoutCompression)->not()->toBeFalse();

    if ($sizeWithoutCompression === false) {
        throw new RuntimeException('Unable to determine cache file size.');
    }

    clearstatcache(); // Results of filesize() are cached. Clear that to get correct result for compressed file size.

    // Then write the same item again, this time with compression enabled.
    $cache->useCompression();

    $cache->set($response->cacheKey(), $response);

    $sizeWithCompression = filesize(helper_cachedir() . '/' . $response->cacheKey());

    expect($sizeWithCompression)->not()->toBeFalse();

    if ($sizeWithCompression === false) {
        throw new RuntimeException('Unable to determine cache file size.');
    }

    expect($sizeWithCompression)->toBeLessThan($sizeWithoutCompression);

    // Didn't want to check for exact numbers, because I guess they could be a bit different on different systems.
    // But thought the diff should at least be more than 30% for the test to succeed.
    $savedBytes = $sizeWithoutCompression - $sizeWithCompression;

    expect($savedBytes)->toBeGreaterThan($sizeWithoutCompression * 0.3);
});

it('gets compressed cache items', function () {
    $response = new RespondedRequest(
        new Request('GET', '/compression'),
        new Response(body: Utils::streamFor('Hello World')),
    );

    $key = $response->cacheKey();

    $cache = new FileCache(helper_cachedir());

    $cache->useCompression();

    $cache->set($key, $response);

    // Reading the item back has to give the original response body.
    $fromCache = $cache->get($key);

    expect($fromCache)->toBeInstanceOf(RespondedRequest::class);

    expect(Http::getBodyString($fromCache))->toBe('Hello World');
});

it('is also able to decode uncompressed cache files when useCompression() is used', function () {
    $cache = new FileCache(helper_cachedir());

    $response = new RespondedRequest(new Request('GET', '/yo'), new Response(body: Utils::streamFor('Yo')));

    $key = $response->cacheKey();

    // The item is written while compression is still turned off.
    $cache->set($key, $response);

    $readWithoutCompression = $cache->get($key);

    expect($readWithoutCompression)->toBeInstanceOf(RespondedRequest::class);

    expect(Http::getBodyString($readWithoutCompression))->toBe('Yo');

    // After turning compression on, the uncompressed file must still be readable.
    $cache->useCompression();

    $readWithCompression = $cache->get($key);

    expect($readWithCompression)->toBeInstanceOf(RespondedRequest::class);

    expect(Http::getBodyString($readWithCompression))->toBe('Yo');
});

it('can also read compressed cache files, when useCompression() is not used', function () {
    $compressingCache = new FileCache(helper_cachedir());

    $compressingCache->useCompression();

    $response = new RespondedRequest(new Request('GET', '/no'), new Response(body: Utils::streamFor('No')));

    $key = $response->cacheKey();

    $compressingCache->set($key, $response);

    // A fresh instance on the same directory, without useCompression() being called.
    $plainCache = new FileCache(helper_cachedir());

    $fromCache = $plainCache->get($key);

    expect($fromCache)->toBeInstanceOf(RespondedRequest::class);

    expect(Http::getBodyString($fromCache))->toBe('No');
});

test('you can change the default ttl', function () {
    $cache = new FileCache(helper_cachedir());

    $cache->ttl(900);

    $response = new RespondedRequest(
        new Request('GET', '/foo'),
        new Response(body: Utils::streamFor('bar')),
    );

    $key = $response->cacheKey();

    // No ttl is passed to set(), so the item should get the default ttl configured above.
    $cache->set($key, $response);

    $storedItem = helper_getCacheItemByKey($key);

    expect($storedItem)->toBeInstanceOf(CacheItem::class);

    expect($storedItem?->ttl)->toBe(900);
});

it('prolongs the time to live for a single item', function () {
    $cache = new FileCache(helper_cachedir());

    $cache->ttl(100);

    $response = new RespondedRequest(new Request('GET', '/a'), new Response(body: Utils::streamFor('b')));

    $cache->set($response->cacheKey(), $response);

    $itemBefore = helper_getCacheItemByKey($response->cacheKey());

    expect($itemBefore)->toBeInstanceOf(CacheItem::class);

    expect($itemBefore?->ttl)->toBe(100);

    /** @var CacheItem $itemBefore */

    $cache->prolong($itemBefore->key(), 200);

    // Read the cache file again, to see the ttl that was written by prolong().
    $itemAfter = helper_getCacheItemByKey($itemBefore->key());

    expect($itemAfter)->toBeInstanceOf(CacheItem::class);

    expect($itemAfter?->ttl)->toBe(200);
});

it('prolongs the time to live for all items in the cache directory', function () {
    $cache = new FileCache(helper_cachedir());

    // Request path, response body and the ttl that the item is initially stored with.
    $fixtures = [
        ['/a', 'b', 100],
        ['/c', 'd', 200],
        ['/e', 'f', 300],
    ];

    $keysWithInitialTtl = [];

    foreach ($fixtures as [$path, $body, $ttl]) {
        $response = new RespondedRequest(new Request('GET', $path), new Response(body: Utils::streamFor($body)));

        $cache->set($key = $response->cacheKey(), $response, $ttl);

        $keysWithInitialTtl[] = [$key, $ttl];
    }

    foreach ($keysWithInitialTtl as [$key, $initialTtl]) {
        $item = helper_getCacheItemByKey($key);

        expect($item)->toBeInstanceOf(CacheItem::class)
            ->and($item?->ttl)->toBe($initialTtl);
    }

    $cache->prolongAll(250);

    // Prolonging sets the provided value, no matter if an item's previous ttl value was
    // higher than the new one.
    foreach ($keysWithInitialTtl as [$key]) {
        $item = helper_getCacheItemByKey($key);

        expect($item)->toBeInstanceOf(CacheItem::class)
            ->and($item?->ttl)->toBe(250);
    }
});

test('the get() and has() methods delete an expired item, but prolong does not', function () {
    $cache = new FileCache(helper_cachedir());

    $response = new RespondedRequest(new Request('GET', '/'), new Response());

    // with get()
    $createdAt = (new DateTimeImmutable())->sub(new DateInterval('PT11S'));

    $expiredItem = new CacheItem($response, $response->cacheKey(), 10, $createdAt);

    $cache->set($expiredItem->key(), $expiredItem);

    $retrieved = $cache->get($expiredItem->key());

    expect($retrieved)->toBeNull();

    expect(file_exists(helper_cachedir($response->cacheKey())))->toBeFalse();

    // with has()
    $createdAt = (new DateTimeImmutable())->sub(new DateInterval('PT11S'));

    $expiredItem = new CacheItem($response, $response->cacheKey(), 10, $createdAt);

    $cache->set($expiredItem->key(), $expiredItem);

    $cache->has($expiredItem->key());

    expect($cache->has($expiredItem->key()))->toBeFalse();

    expect(file_exists(helper_cachedir($expiredItem->key())))->toBeFalse();

    // with prolong(): the expired item is written again and then gets a new ttl of 20,
    // after that it is found in the cache and its file still exists.
    $cache->set($expiredItem->key(), $expiredItem);

    $cache->prolong($expiredItem->key(), 20);

    expect($cache->has($expiredItem->key()))->toBeTrue();

    expect(file_exists(helper_cachedir($expiredItem->key())))->toBeTrue();
});


================================================
FILE: tests/Cache/_cachefilecontent
================================================
a:8:{s:13:"requestMethod";s:3:"GET";s:10:"requestUri";s:74:"https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php";s:14:"requestHeaders";a:3:{s:4:"Host";a:1:{i:0;s:18:"www.crwlr.software";}s:10:"User-Agent";a:1:{i:0;s:117:"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36";}s:6:"Cookie";a:2:{i:0;s:20:"XSRF-TOKEN=xsrftoken";i:1;s:29:"crwlrsoftware_session=session";}}s:11:"requestBody";s:0:"";s:12:"effectiveUri";s:74:"https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php";s:18:"responseStatusCode";i:200;s:15:"responseHeaders";a:12:{s:6:"Server";a:1:{i:0;s:12:"nginx/1.21.4";}s:12:"Content-Type";a:1:{i:0;s:24:"text/html; charset=UTF-8";}s:17:"Transfer-Encoding";a:1:{i:0;s:7:"chunked";}s:10:"Connection";a:1:{i:0;s:10:"keep-alive";}s:4:"Vary";a:1:{i:0;s:15:"Accept-Encoding";}s:12:"X-Powered-By";a:1:{i:0;s:9:"PHP/8.1.1";}s:13:"Cache-Control";a:1:{i:0;s:17:"no-cache, private";}s:4:"Date";a:1:{i:0;s:29:"Tue, 03 Jan 2023 12:38:20 GMT";}s:10:"Set-Cookie";a:2:{i:0;s:81:"XSRF-TOKEN=xsrftoken; expires=Tue, 03-Jan-2023 14:38:20 GMT; Max-Age=7200; path=/";i:1;s:100:"crwlrsoftware_session=session; expires=Tue, 03-Jan-2023 14:38:20 GMT; Max-Age=7200; path=/; httponly";}s:15:"X-Frame-Options";a:2:{i:0;s:10:"SAMEORIGIN";i:1;s:4:"DENY";}s:16:"X-XSS-Protection";a:1:{i:0;s:13:"1; mode=block";}s:22:"X-Content-Type-Options";a:2:{i:0;s:7:"nosniff";i:1;s:7:"nosniff";}}s:12:"responseBody";s:39078:"<!doctype html>
<html lang="en">
<head>
<meta charset=utf-8>
<meta http-equiv="x-ua-compatible" content="ie=edge">
<title>Dealing with HTTP (Url) Query Strings in PHP - crwlr.software Blog</title>
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="csrf-token" content="yolo">
<link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png">
<link rel="manifest" href="/site.webmanifest">
<meta name="description" content="There is a new package in town called query-string. It allows to create, access and manipulate query strings for HTTP requests in a very convenient way. Here's a quick overview of what you can do with it and also how it can be used via the url package." />
<meta name="author" content="Christian Olear" />
<meta property="og:title" content="Dealing with HTTP (Url) Query Strings in PHP" />
<meta property="og:type" content="article" />
<meta property="og:url" content="https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php" />
<meta property="og:description" content="There is a new package in town called query-string. It allows to create, access and manipulate query strings for HTTP requests in a very convenient way. Here's a quick overview of what you can do with it and also how it can be used via the url package." />
<meta property="og:image" content="https://www.crwlr.software/images/social/blog/query-string-package-release.png">
<meta name="twitter:card" content="summary" />
<meta name="twitter:image" content="https://www.crwlr.software/images/social/blog/query-string-package-release-twitter.png" />
<meta name="twitter:site" content="@crwlrsoft" />
<meta name="twitter:title" content="Dealing with HTTP (Url) Query Strings in PHP" />
<meta name="twitter:description" content="There is a new package in town called query-string. It allows to create, access and manipulate query strings for HTTP requests in a very convenient way. Here's a quick overview of what you can do with it and also how it can be used via the url package." />
<style>/*! tailwindcss v3.0.23 | MIT License | https://tailwindcss.com*/*,:after,:before{border:0 solid #e5e7eb;box-sizing:border-box}:after,:before{--tw-content:""}html{-webkit-text-size-adjust:100%;font-family:ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol,Noto Color Emoji;line-height:1.5;-moz-tab-size:4;-o-tab-size:4;tab-size:4}body{line-height:inherit;margin:0}hr{border-top-width:1px;color:inherit;height:0}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,pre,samp{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{border-collapse:collapse;border-color:inherit;text-indent:0}button,input,optgroup,select,textarea{color:inherit;font-family:inherit;font-size:100%;line-height:inherit;margin:0;padding:0}button,select{text-transform:none}[type=button],[type=reset],[type=submit],button{-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dd,dl,figure,h1,h2,h3,h4,h5,h6,hr,p,pre{margin:0}fieldset{margin:0}fieldset,legend{padding:0}menu,ol,ul{list-style:none;margin:0;padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{color:#9ca3af;opacity:1}input:-ms-input-
placeholder,textarea:-ms-input-placeholder{color:#9ca3af;opacity:1}input::placeholder,textarea::placeholder{color:#9ca3af;opacity:1}[role=button],button{cursor:pointer}:disabled{cursor:default}audio,canvas,embed,iframe,img,object,svg,video{display:block;vertical-align:middle}img,video{height:auto;max-width:100%}[hidden]{display:none}[multiple],[type=date],[type=datetime-local],[type=email],[type=month],[type=number],[type=password],[type=search],[type=tel],[type=text],[type=time],[type=url],[type=week],select,textarea{--tw-shadow:0 0 #0000;-webkit-appearance:none;-moz-appearance:none;appearance:none;background-color:#fff;border-color:#6b7280;border-radius:0;border-width:1px;font-size:1rem;line-height:1.5rem;padding:.5rem .75rem}[multiple]:focus,[type=date]:focus,[type=datetime-local]:focus,[type=email]:focus,[type=month]:focus,[type=number]:focus,[type=password]:focus,[type=search]:focus,[type=tel]:focus,[type=text]:focus,[type=time]:focus,[type=url]:focus,[type=week]:focus,select:focus,textarea:focus{--tw-ring-inset:var(--tw-empty,/*!*/ /*!*/);--tw-ring-offset-width:0px;--tw-ring-offset-color:#fff;--tw-ring-color:#2563eb;--tw-ring-offset-shadow:var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow:var(--tw-ring-inset) 0 0 0 calc(1px + var(--tw-ring-offset-width)) var(--tw-ring-color);border-color:#2563eb;box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow);outline:2px solid 
transparent;outline-offset:2px}input::-moz-placeholder,textarea::-moz-placeholder{color:#6b7280;opacity:1}input:-ms-input-placeholder,textarea:-ms-input-placeholder{color:#6b7280;opacity:1}input::placeholder,textarea::placeholder{color:#6b7280;opacity:1}::-webkit-datetime-edit-fields-wrapper{padding:0}::-webkit-date-and-time-value{min-height:1.5em}::-webkit-datetime-edit,::-webkit-datetime-edit-day-field,::-webkit-datetime-edit-hour-field,::-webkit-datetime-edit-meridiem-field,::-webkit-datetime-edit-millisecond-field,::-webkit-datetime-edit-minute-field,::-webkit-datetime-edit-month-field,::-webkit-datetime-edit-second-field,::-webkit-datetime-edit-year-field{padding-bottom:0;padding-top:0}select{-webkit-print-color-adjust:exact;background-image:url("data:image/svg+xml;charset=utf-8,%3Csvg xmlns='http://www.w3.org/2000/svg' fill='none' viewBox='0 0 20 20'%3E%3Cpath stroke='%236b7280' stroke-linecap='round' stroke-linejoin='round' stroke-width='1.5' d='m6 8 4 4 4-4'/%3E%3C/svg%3E");background-position:right .5rem center;background-repeat:no-repeat;background-size:1.5em 1.5em;color-adjust:exact;padding-right:2.5rem}[multiple]{-webkit-print-color-adjust:unset;background-image:none;background-position:0 0;background-repeat:unset;background-size:initial;color-adjust:unset;padding-right:.75rem}[type=checkbox],[type=radio]{-webkit-print-color-adjust:exact;--tw-shadow:0 0 #0000;-webkit-appearance:none;-moz-appearance:none;appearance:none;background-color:#fff;background-origin:border-box;border-color:#6b7280;border-width:1px;color:#2563eb;color-adjust:exact;display:inline-block;flex-shrink:0;height:1rem;padding:0;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;vertical-align:middle;width:1rem}[type=checkbox]{border-radius:0}[type=radio]{border-radius:100%}[type=checkbox]:focus,[type=radio]:focus{--tw-ring-inset:var(--tw-empty,/*!*/ 
/*!*/);--tw-ring-offset-width:2px;--tw-ring-offset-color:#fff;--tw-ring-color:#2563eb;--tw-ring-offset-shadow:var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow:var(--tw-ring-inset) 0 0 0 calc(2px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow);outline:2px solid transparent;outline-offset:2px}[type=checkbox]:checked,[type=radio]:checked{background-color:currentColor;background-position:50%;background-repeat:no-repeat;background-size:100% 100%;border-color:transparent}[type=checkbox]:checked{background-image:url("data:image/svg+xml;charset=utf-8,%3Csvg viewBox='0 0 16 16' fill='%23fff' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M12.207 4.793a1 1 0 0 1 0 1.414l-5 5a1 1 0 0 1-1.414 0l-2-2a1 1 0 0 1 1.414-1.414L6.5 9.086l4.293-4.293a1 1 0 0 1 1.414 0z'/%3E%3C/svg%3E")}[type=radio]:checked{background-image:url("data:image/svg+xml;charset=utf-8,%3Csvg viewBox='0 0 16 16' fill='%23fff' xmlns='http://www.w3.org/2000/svg'%3E%3Ccircle cx='8' cy='8' r='3'/%3E%3C/svg%3E")}[type=checkbox]:checked:focus,[type=checkbox]:checked:hover,[type=radio]:checked:focus,[type=radio]:checked:hover{background-color:currentColor;border-color:transparent}[type=checkbox]:indeterminate{background-color:currentColor;background-image:url("data:image/svg+xml;charset=utf-8,%3Csvg xmlns='http://www.w3.org/2000/svg' fill='none' viewBox='0 0 16 16'%3E%3Cpath stroke='%23fff' stroke-linecap='round' stroke-linejoin='round' stroke-width='2' d='M4 8h8'/%3E%3C/svg%3E");background-position:50%;background-repeat:no-repeat;background-size:100% 100%;border-color:transparent}[type=checkbox]:indeterminate:focus,[type=checkbox]:indeterminate:hover{background-color:currentColor;border-color:transparent}[type=file]{background:unset;border-color:inherit;border-radius:0;border-width:0;font-size:unset;line-height:inherit;padding:0}[type=file]:focus{outline:1px auto 
-webkit-focus-ring-color}*,:after,:before{--tw-translate-x:0;--tw-translate-y:0;--tw-rotate:0;--tw-skew-x:0;--tw-skew-y:0;--tw-scale-x:1;--tw-scale-y:1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness:proximity;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width:0px;--tw-ring-offset-color:#fff;--tw-ring-color:rgba(59,130,246,.5);--tw-ring-offset-shadow:0 0 #0000;--tw-ring-shadow:0 0 #0000;--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }.crwlr-prose{font-weight:300;line-height:2rem}.crwlr-prose h1,.crwlr-prose h2{font-weight:500;margin-bottom:1.75rem;margin-top:3rem}.crwlr-prose h3,.crwlr-prose h4{font-weight:600;margin-bottom:1.25rem;margin-top:2.5rem}.crwlr-prose h1:first-child,.crwlr-prose h2:first-child,.crwlr-prose h3:first-child{margin-top:0}.crwlr-prose h1+h2,.crwlr-prose h2+h3,.crwlr-prose h3+h4{margin-top:.75rem}.crwlr-prose h1{font-size:2.25rem;line-height:2.5rem}.crwlr-prose h2{font-size:1.875rem;line-height:2.25rem}.crwlr-prose h3{font-size:1.5rem;line-height:2rem}.crwlr-prose h4{font-size:1.25rem;line-height:1.75rem}.crwlr-prose strong{font-weight:600}.crwlr-prose p{margin-bottom:1.25rem;margin-top:1.25rem}.crwlr-prose ul{list-style-position:outside;list-style-type:disc;margin-left:1.5rem}.crwlr-prose ul ul,.crwlr-prose ul ul ul{list-style-type:circle}.crwlr-prose a{--tw-text-opacity:1;color:rgb(12 74 110/var(--tw-text-opacity));cursor:pointer;font-weight:600}.crwlr-prose a:hover{--tw-text-opacity:1;color:rgb(12 162 107/var(--tw-text-opacity))}.crwlr-prose 
pre{margin-bottom:1.25rem;margin-top:1.25rem}.crwlr-prose .no-margin-top{margin-top:0}.crwlr-prose .no-margin-bottom{margin-bottom:0}.crwlr-prose code.hljs{--tw-shadow:0 4px 6px -1px rgba(0,0,0,.1),0 2px 4px -2px rgba(0,0,0,.1);--tw-shadow-colored:0 4px 6px -1px var(--tw-shadow-color),0 2px 4px -2px var(--tw-shadow-color);--tw-shadow-color:#d6d3d1;--tw-shadow:var(--tw-shadow-colored);border-radius:.5rem;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow);font-size:.875rem;line-height:1.5rem;padding:1.25rem}.crwlr-prose code:not(.hljs){--tw-bg-opacity:1;--tw-text-opacity:1;background-color:rgb(231 229 228/var(--tw-bg-opacity));border-radius:.375rem;color:rgb(6 115 75/var(--tw-text-opacity));display:inline-block;font-size:1rem;line-height:1.5rem;padding-left:.25rem;padding-right:.25rem}.crwlr-prose .date{--tw-text-opacity:1;color:rgb(120 113 108/var(--tw-text-opacity));font-size:.875rem;line-height:1.25rem}.crwlr-prose h1+.date,.crwlr-prose h2+.date{margin-bottom:1.75rem;margin-top:-1.25rem}.crwlr-toc{--tw-border-opacity:1;--tw-bg-opacity:1;background-color:rgb(255 255 255/var(--tw-bg-opacity));border-color:rgb(203 213 225/var(--tw-border-opacity));border-radius:.5rem;border-width:1px;padding:.75rem 1.25rem}.crwlr-prose .crwlr-toc ul,.crwlr-toc ul{list-style-type:none;margin-left:0}.crwlr-prose .crwlr-toc ul ul,.crwlr-toc ul ul{margin-left:1.5rem}.crwlr-toc ul li:before{--tw-text-opacity:1;color:rgb(12 162 107/var(--tw-text-opacity));content:"#";padding-right:.75rem}.crwlr-prose .crwlr-toc a,.crwlr-toc a{--tw-text-opacity:1;color:rgb(41 37 36/var(--tw-text-opacity));font-weight:400}.crwlr-prose .crwlr-toc a:hover,.crwlr-toc a:hover{--tw-text-opacity:1;color:rgb(11 140 94/var(--tw-text-opacity))}.crwlr-docs-h2:before{content:"#"}.crwlr-docs-h2:before,.crwlr-docs-h3:before{--tw-text-opacity:1;color:rgb(12 162 
107/var(--tw-text-opacity));padding-right:.75rem}.crwlr-docs-h3:before{content:"##"}.invisible{visibility:hidden}.static{position:static}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.z-10{z-index:10}.z-20{z-index:20}.z-0{z-index:0}.col-span-4{grid-column:span 4/span 4}.float-right{float:right}.clear-both{clear:both}.my-5{margin-bottom:1.25rem;margin-top:1.25rem}.mx-auto{margin-left:auto;margin-right:auto}.mb-7{margin-bottom:1.75rem}.mt-7{margin-top:1.75rem}.mb-5{margin-bottom:1.25rem}.mb-10{margin-bottom:2.5rem}.mb-3{margin-bottom:.75rem}.mb-8{margin-bottom:2rem}.mb-12{margin-bottom:3rem}.ml-3{margin-left:.75rem}.mr-1{margin-right:.25rem}.ml-5{margin-left:1.25rem}.mr-2{margin-right:.5rem}.mr-3{margin-right:.75rem}.mr-5{margin-right:1.25rem}.mt-5{margin-top:1.25rem}.mt-1{margin-top:.25rem}.mr-10{margin-right:2.5rem}.-mt-20{margin-top:-5rem}.ml-7{margin-left:1.75rem}.mt-2{margin-top:.5rem}.mt-3{margin-top:.75rem}.box-content{box-sizing:content-box}.block{display:block}.inline-block{display:inline-block}.flex{display:flex}.inline-flex{display:inline-flex}.grid{display:grid}.hidden{display:none}.h-6{height:1.5rem}.h-4{height:1rem}.h-52{height:13rem}.h-10{height:2.5rem}.h-16{height:4rem}.h-9{height:2.25rem}.h-8{height:2rem}.h-80{height:20rem}.h-full{height:100%}.max-h-max{max-height:-webkit-max-content;max-height:-moz-max-content;max-height:max-content}.w-4{width:1rem}.w-40{width:10rem}.w-full{width:100%}.w-6{width:1.5rem}.w-10{width:2.5rem}.w-\[150\%\]{width:150%}.w-1\/3{width:33.333333%}.w-1\/4{width:25%}.w-\[597px\]{width:597px}.max-w-7xl{max-width:80rem}.max-w-full{max-width:100%}.grow-0{flex-grow:0}.grow{flex-grow:1}.basis-3\/5{flex-basis:60%}.basis-2\/5{flex-basis:40%}.basis-auto{flex-basis:auto}.basis-1\/3{flex-basis:33.333333%}.-rotate-12{--tw-rotate:-12deg}.-rotate-12,.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) 
scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.cursor-pointer{cursor:pointer}.list-inside{list-style-position:inside}.list-disc{list-style-type:disc}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-center{align-items:center}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.justify-items-stretch{justify-items:stretch}.gap-5{gap:1.25rem}.gap-7{gap:1.75rem}.overflow-hidden{overflow:hidden}.overflow-y-scroll{overflow-y:scroll}.truncate{overflow:hidden;text-overflow:ellipsis;white-space:nowrap}.rounded-md{border-radius:.375rem}.rounded-full{border-radius:9999px}.rounded-lg{border-radius:.5rem}.rounded{border-radius:.25rem}.rounded-t-md{border-top-left-radius:.375rem;border-top-right-radius:.375rem}.rounded-b-md{border-bottom-left-radius:.375rem;border-bottom-right-radius:.375rem}.border-b{border-bottom-width:1px}.border-t{border-top-width:1px}.border-t-8{border-top-width:8px}.border-b-4{border-bottom-width:4px}.border-stone-300{--tw-border-opacity:1;border-color:rgb(214 211 209/var(--tw-border-opacity))}.border-b-slate-300{--tw-border-opacity:1;border-bottom-color:rgb(203 213 225/var(--tw-border-opacity))}.border-t-slate-300{--tw-border-opacity:1;border-top-color:rgb(203 213 225/var(--tw-border-opacity))}.border-t-creen-400{--tw-border-opacity:1;border-top-color:rgb(58 203 150/var(--tw-border-opacity))}.border-b-creen-400{--tw-border-opacity:1;border-bottom-color:rgb(58 203 150/var(--tw-border-opacity))}.bg-gray-200{--tw-bg-opacity:1;background-color:rgb(229 231 235/var(--tw-bg-opacity))}.bg-red-600{--tw-bg-opacity:1;background-color:rgb(220 38 38/var(--tw-bg-opacity))}.bg-yellow-400{--tw-bg-opacity:1;background-color:rgb(250 204 21/var(--tw-bg-opacity))}.bg-green-500{--tw-bg-opacity:1;background-color:rgb(34 197 94/var(--tw-bg-opacity))}.bg-\[\#012B37\]{--tw-bg-opacity:1;background-color:rgb(1 43 
55/var(--tw-bg-opacity))}.bg-creen-600{--tw-bg-opacity:1;background-color:rgb(12 162 107/var(--tw-bg-opacity))}.bg-white{--tw-bg-opacity:1;background-color:rgb(255 255 255/var(--tw-bg-opacity))}.bg-slate-700{--tw-bg-opacity:1;background-color:rgb(51 65 85/var(--tw-bg-opacity))}.bg-slate-50{--tw-bg-opacity:1;background-color:rgb(248 250 252/var(--tw-bg-opacity))}.bg-slate-600{--tw-bg-opacity:1;background-color:rgb(71 85 105/var(--tw-bg-opacity))}.p-2{padding:.5rem}.p-5{padding:1.25rem}.p-3{padding:.75rem}.px-7{padding-left:1.75rem;padding-right:1.75rem}.py-3{padding-bottom:.75rem;padding-top:.75rem}.py-2{padding-bottom:.5rem;padding-top:.5rem}.px-5{padding-left:1.25rem;padding-right:1.25rem}.py-8{padding-bottom:2rem;padding-top:2rem}.pb-2{padding-bottom:.5rem}.pt-9{padding-top:2.25rem}.pb-10{padding-bottom:2.5rem}.pl-8{padding-left:2rem}.pb-3{padding-bottom:.75rem}.text-center{text-align:center}.align-middle{vertical-align:middle}.text-2xl{font-size:1.5rem;line-height:2rem}.text-xl{font-size:1.25rem;line-height:1.75rem}.text-3xl{font-size:1.875rem;line-height:2.25rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-base{font-size:1rem;line-height:1.5rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.font-semibold{font-weight:600}.font-medium{font-weight:500}.font-bold{font-weight:700}.font-normal{font-weight:400}.text-slate-700{--tw-text-opacity:1;color:rgb(51 65 85/var(--tw-text-opacity))}.text-creen-600{--tw-text-opacity:1;color:rgb(12 162 107/var(--tw-text-opacity))}.text-gray-200{--tw-text-opacity:1;color:rgb(229 231 235/var(--tw-text-opacity))}.text-white{--tw-text-opacity:1;color:rgb(255 255 255/var(--tw-text-opacity))}.text-stone-700{--tw-text-opacity:1;color:rgb(68 64 60/var(--tw-text-opacity))}.text-sky-900{--tw-text-opacity:1;color:rgb(12 74 110/var(--tw-text-opacity))}.text-gray-700{--tw-text-opacity:1;color:rgb(55 65 81/var(--tw-text-opacity))}.text-stone-500{--tw-text-opacity:1;color:rgb(120 113 108/var(--tw-text-opacity))}.shadow-lg{--tw-shadow:0 
10px 15px -3px rgba(0,0,0,.1),0 4px 6px -4px rgba(0,0,0,.1);--tw-shadow-colored:0 10px 15px -3px var(--tw-shadow-color),0 4px 6px -4px var(--tw-shadow-color)}.shadow-lg,.shadow-md{box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.shadow-md{--tw-shadow:0 4px 6px -1px rgba(0,0,0,.1),0 2px 4px -2px rgba(0,0,0,.1);--tw-shadow-colored:0 4px 6px -1px var(--tw-shadow-color),0 2px 4px -2px var(--tw-shadow-color)}.shadow-sm{--tw-shadow:0 1px 2px 0 rgba(0,0,0,.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.shadow-stone-300{--tw-shadow-color:#d6d3d1;--tw-shadow:var(--tw-shadow-colored)}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.hover\:bg-creen-700:hover{--tw-bg-opacity:1;background-color:rgb(11 140 94/var(--tw-bg-opacity))}.hover\:bg-creen-600:hover{--tw-bg-opacity:1;background-color:rgb(12 162 107/var(--tw-bg-opacity))}.hover\:text-creen-100:hover{--tw-text-opacity:1;color:rgb(198 231 218/var(--tw-text-opacity))}.hover\:text-creen-500:hover{--tw-text-opacity:1;color:rgb(29 189 131/var(--tw-text-opacity))}.hover\:text-creen-600:hover{--tw-text-opacity:1;color:rgb(12 162 107/var(--tw-text-opacity))}.hover\:text-slate-900:hover{--tw-text-opacity:1;color:rgb(15 23 42/var(--tw-text-opacity))}.hover\:text-white:hover{--tw-text-opacity:1;color:rgb(255 255 255/var(--tw-text-opacity))}.hover\:shadow-none:hover{--tw-shadow:0 0 #0000;--tw-shadow-colored:0 0 #0000;box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.focus\:border-creen-400:focus{--tw-border-opacity:1;border-color:rgb(58 203 150/var(--tw-border-opacity))}.focus\:ring:focus{--tw-ring-offset-shadow:var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) 
var(--tw-ring-offset-color);--tw-ring-shadow:var(--tw-ring-inset) 0 0 0 calc(3px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow,0 0 #0000)}.focus\:ring-creen-300:focus{--tw-ring-opacity:1;--tw-ring-color:rgb(83 213 165/var(--tw-ring-opacity))}.focus\:ring-opacity-50:focus{--tw-ring-opacity:0.5}.focus\:ring-offset-0:focus{--tw-ring-offset-width:0px}.active\:bg-creen-500:active{--tw-bg-opacity:1;background-color:rgb(29 189 131/var(--tw-bg-opacity))}.active\:text-creen-800:active{--tw-text-opacity:1;color:rgb(6 115 75/var(--tw-text-opacity))}.disabled\:cursor-not-allowed:disabled{cursor:not-allowed}.disabled\:bg-stone-600:disabled{--tw-bg-opacity:1;background-color:rgb(87 83 78/var(--tw-bg-opacity))}@media (min-width:768px){.md\:visible{visibility:visible}.md\:top-3{top:.75rem}.md\:col-span-3{grid-column:span 3/span 3}.md\:mb-0{margin-bottom:0}.md\:ml-8{margin-left:2rem}.md\:inline{display:inline}.md\:flex{display:flex}.md\:hidden{display:none}.md\:w-\[30\%\]{width:30%}.md\:w-3\/5{width:60%}.md\:w-1\/2{width:50%}.md\:grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.md\:items-start{align-items:flex-start}.md\:gap-7{gap:1.75rem}.md\:gap-8{gap:2rem}.md\:border-r{border-right-width:1px}.md\:border-r-slate-300{--tw-border-opacity:1;border-right-color:rgb(203 213 225/var(--tw-border-opacity))}.md\:text-xl{font-size:1.25rem;line-height:1.75rem}}@media (min-width:1024px){.crwlr-prose .lg\:no-margin-bottom{margin-bottom:0}.lg\:-mt-28{margin-top:-7rem}.lg\:grid{display:grid}.lg\:w-\[23\%\]{width:23%}.lg\:w-2\/3{width:66.666667%}.lg\:w-1\/3{width:33.333333%}.lg\:grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.lg\:grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.lg\:items-stretch{align-items:stretch}.lg\:justify-items-stretch{justify-items:stretch}.lg\:gap-5{gap:1.25rem}}@media (min-width:1280px){.xl\:w-1\/5{width:20%}.xl\:w-3\/4{width:75%}} </style>
<link rel="stylesheet" href="/css/highlight/monokai-sublime.min.css?id=41c020e9bd57b47ab0668140bb6af51b">
</head><body id="crw" class="bg-slate-700"><svg style="display: none" xmlns="http://www.w3.org/2000/svg">
<defs>
<symbol id="book" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2">
    <path stroke-linecap="round" stroke-linejoin="round" d="M12 6.253v13m0-13C10.832 5.477 9.246 5 7.5 5S4.168 5.477 3 6.253v13C4.168 18.477 5.754 18 7.5 18s3.332.477 4.5 1.253m0-13C13.168 5.477 14.754 5 16.5 5c1.747 0 3.332.477 4.5 1.253v13C19.832 18.477 18.247 18 16.5 18c-1.746 0-3.332.477-4.5 1.253" />
</symbol>
<symbol id="collection" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2">
    <path stroke-linecap="round" stroke-linejoin="round" d="M19 11H5m14 0a2 2 0 012 2v6a2 2 0 01-2 2H5a2 2 0 01-2-2v-6a2 2 0 012-2m14 0V9a2 2 0 00-2-2M5 11V9a2 2 0 012-2m0 0V5a2 2 0 012-2h6a2 2 0 012 2v2M7 7h10" />
</symbol>
<symbol id="github" fill="currentColor" viewBox="0 0 121 118" stroke="currentColor">
    <path d="M60.388,0 C27.041,0 0,27.036 0,60.388 C0,87.069 17.303,109.705 41.297,117.69 C44.315,118.249 45.423,116.38 45.423,114.785 C45.423,113.345 45.367,108.588 45.341,103.542 C28.541,107.195 24.996,96.417 24.996,96.417 C22.249,89.437 18.291,87.581 18.291,87.581 C12.812,83.833 18.704,83.91 18.704,83.91 C24.768,84.336 27.961,90.133 27.961,90.133 C33.347,99.365 42.088,96.696 45.534,95.153 C46.076,91.25 47.641,88.586 49.368,87.078 C35.955,85.551 21.855,80.373 21.855,57.234 C21.855,50.641 24.214,45.254 28.077,41.025 C27.45,39.504 25.383,33.362 28.662,25.044 C28.662,25.044 33.733,23.421 45.273,31.234 C50.09,29.896 55.256,29.225 60.388,29.202 C65.52,29.225 70.69,29.896 75.516,31.234 C87.042,23.421 92.106,25.044 92.106,25.044 C95.393,33.362 93.325,39.504 92.698,41.025 C96.57,45.254 98.913,50.641 98.913,57.234 C98.913,80.428 84.786,85.535 71.339,87.03 C73.505,88.904 75.435,92.579 75.435,98.213 C75.435,106.293 75.365,112.796 75.365,114.785 C75.365,116.392 76.452,118.275 79.513,117.682 C103.494,109.688 120.775,87.06 120.775,60.388 C120.775,27.036 93.738,0 60.388,0"></path>
    <path d="M22.872,86.704 C22.739,87.004 22.267,87.094 21.837,86.888 C21.399,86.691 21.153,86.282 21.295,85.981 C21.425,85.672 21.898,85.586 22.335,85.793 C22.774,85.99 23.024,86.403 22.872,86.704"></path>
    <path d="M25.318,89.432 C25.03,89.699 24.467,89.575 24.085,89.153 C23.69,88.732 23.616,88.169 23.908,87.898 C24.205,87.631 24.751,87.756 25.147,88.177 C25.542,88.603 25.619,89.162 25.318,89.432"></path>
    <path d="M27.699,92.91 C27.329,93.167 26.724,92.926 26.35,92.389 C25.98,91.852 25.98,91.208 26.358,90.95 C26.733,90.692 27.329,90.924 27.708,91.457 C28.077,92.003 28.077,92.647 27.699,92.91"></path>
    <path d="M30.961,96.27 C30.63,96.635 29.925,96.537 29.409,96.039 C28.881,95.552 28.734,94.861 29.066,94.496 C29.401,94.13 30.11,94.233 30.63,94.727 C31.154,95.213 31.314,95.909 30.961,96.27"></path>
    <path d="M35.461,98.221 C35.315,98.694 34.636,98.909 33.952,98.708 C33.269,98.501 32.822,97.947 32.96,97.469 C33.102,96.993 33.784,96.769 34.473,96.984 C35.155,97.19 35.603,97.74 35.461,98.221"></path>
    <path d="M40.403,98.583 C40.42,99.081 39.84,99.494 39.122,99.503 C38.4,99.519 37.816,99.116 37.808,98.626 C37.808,98.123 38.375,97.714 39.097,97.702 C39.815,97.688 40.403,98.088 40.403,98.583"></path>
    <path d="M45.002,97.8 C45.088,98.286 44.589,98.785 43.876,98.918 C43.175,99.046 42.526,98.746 42.437,98.264 C42.35,97.766 42.858,97.267 43.558,97.138 C44.272,97.014 44.911,97.306 45.002,97.8"></path>
</symbol>
<symbol id="twitter" viewBox="0 0 400 400">
<path fill="currentColor" d="M163.4,305.5c88.7,0,137.2-73.5,137.2-137.2c0-2.1,0-4.2-0.1-6.2c9.4-6.8,17.6-15.3,24.1-25
c-8.6,3.8-17.9,6.4-27.7,7.6c10-6,17.6-15.4,21.2-26.7c-9.3,5.5-19.6,9.5-30.6,11.7c-8.8-9.4-21.3-15.2-35.2-15.2
c-26.6,0-48.2,21.6-48.2,48.2c0,3.8,0.4,7.5,1.3,11c-40.1-2-75.6-21.2-99.4-50.4c-4.1,7.1-6.5,15.4-6.5,24.2
c0,16.7,8.5,31.5,21.5,40.1c-7.9-0.2-15.3-2.4-21.8-6c0,0.2,0,0.4,0,0.6c0,23.4,16.6,42.8,38.7,47.3c-4,1.1-8.3,1.7-12.7,1.7
c-3.1,0-6.1-0.3-9.1-0.9c6.1,19.2,23.9,33.1,45,33.5c-16.5,12.9-37.3,20.6-59.9,20.6c-3.9,0-7.7-0.2-11.5-0.7
C110.8,297.5,136.2,305.5,163.4,305.5"/>
</symbol>
<symbol id="box" fill="none" stroke="currentColor" viewBox="0 0 25 18">
    <g transform="translate(0.648926, 0.707031)">
        <polygon stroke-linecap="round" stroke-linejoin="round" points="11.6182861 0 2.32592773 2.53967285 -1.99206185e-14 5.53771973 2.07519531 6.41015625 2.07519531 12.001709 11.6182861 15.9133301 21.0585938 12.001709 21.0585938 6.41015625 23.2336426 5.53771973 20.8463135 2.53967285"></polygon>
        <polyline stroke-linecap="round" stroke-linejoin="round" points="2.32592773 2.53967285 11.6168213 5.23913574 20.8463135 2.53967285"></polyline>
        <polyline stroke-linecap="round" stroke-linejoin="round" points="2.07519531 6.41015625 8.70874023 8.62097168 11.6168213 5.23913574 14.430542 8.62097168 21.0585938 6.41015625"></polyline>
        <line x1="11.6168213" y1="5.23913574" x2="11.6168213" y2="15.9133301"></line>
    </g>
</symbol>
<symbol id="calendar" fill="none" stroke="currentColor" stroke-width="2" viewBox="0 0 24 24">
    <path stroke-linecap="round" stroke-linejoin="round" d="M8 7V3m8 4V3m-9 8h10M5 21h14a2 2 0 002-2V7a2 2 0 00-2-2H5a2 2 0 00-2 2v12a2 2 0 002 2z" />
</symbol>
</defs>
</svg><nav class="bg-slate-700 h-16 overflow-hidden border-b-4 border-b-creen-400">
    <div class="w-full max-w-7xl mx-auto flex justify-between h-16 items-center px-5 z-10">
        <a href="https://www.crwlr.software" id="logo" title="crwlr.software" class="inline-block">
            <img src="/images/logo-nav-desktop.png" class="h-9 hidden md:inline" alt="crwlr.software logo" />
            <img src="/images/logo-white-border.png" class="h-8 md:hidden" alt="crwlr.software logo mobile" />
        </a>
        <ul id="navlinks" class="flex z-20">
            <li><a class="text-white hover:text-slate-900 text-lg md:text-xl ml-5 md:ml-8" href="https://www.crwlr.software/packages" title="Overview of PHP packages"
>Packages</a></li>
            <li><a class="text-white hover:text-slate-900 text-lg md:text-xl ml-5 md:ml-8" href="https://www.crwlr.software/blog" title="Blog about crawling and scraping with PHP"
>Blog</a></li>
            <li><a class="text-white hover:text-slate-900 text-lg md:text-xl ml-5 md:ml-8" href="https://www.crwlr.software/contact" title="Get in touch"
>Contact</a></li>
        </ul>
    </div>
    <div class="w-[150%] h-80 -rotate-12 bg-creen-600 invisible md:visible -mt-20 lg:-mt-28 z-0"></div>
</nav>
<main id="content" class="bg-slate-50 text-stone-700 text-lg">
<div class="w-full max-w-7xl mx-auto p-5 pt-9 pb-10"><article class="crwlr-prose">
<h1>Dealing with HTTP (Url) Query Strings in PHP</h1>
<div class="date">2022-06-02</div>
<html><body><p><strong>There is a new package in town called <a href="/packages/query-string">query-string</a>. It allows to create, access and manipulate query strings for HTTP requests in a very convenient way. Here's a quick overview of what you can do with it and also how it can be used via the url package.</strong></p>
<p>The last months I started thinking about improving how you can change a URL's query string. It all started with <a href="https://twitter.com/chrolear/status/1511309249049153545" target="_blank" rel="noopener">this tweet</a> as an answer to <a href="https://twitter.com/heychazza" target="_blank" rel="noopener">@heychazza's</a> tweet about a nice way to build URLs in javascript.</p><p class="text-center"><a href="https://twitter.com/chrolear/status/1511309249049153545" class="inline-block" target="_blank" rel="noopener"><img src="/images/blog/2022-06-02/query-params-tweet.png" class="w-[597px]" alt="Screenshot of a tweet by @chrolear saying: In PHP you can use my url package to get and set query params as array. I Could maybe also add a method to set/add a single param 🤔"></a></p>
<p>Then last week someone added <a href="https://github.com/crwlrsoft/url/issues/27" target="_blank" rel="noopener">this github issue</a> for the url package, and it got me thinking more about this. I liked the suggested API to get and set query params, but I found that it's not enough for more complex query strings. As query strings are also used in POST requests and sent in the request body, I now finally added a separate <a href="/packages/query-string">query-string package</a> and also implemented it in the <a href="/packages/url/v1.2/query-string">url package</a>. </p>
<h2>Implementation in the Url Package</h2>
<p>First off: I set the required PHP version for the new package to 8.0 as the last 7.x version (7.4) is already in the final "security fixes only" phase. The url package currently still requires only 7.2. As I probably plan another BC break for v2 of the url package, for now I just added the query-string package as suggestion to the composer.json. You can manually install it, when you're already on PHP 8.x and want to use the advanced query string functionality.</p>
<p>When you've installed it via</p>
<pre><code class="language-bash">composer require crwlr/query-string</code></pre>
<p>the new <code>queryString()</code> method of the <code>Url</code> class returns an instance of the <code>Query</code> class shipped with the new package. Here's a quick usage example:</p>
<pre><code class="language-php">$url = Url::parse('https://www.example.com/listing?page[number]=3&amp;page[size]=25');

$url-&gt;queryString()
    -&gt;get('page')
    -&gt;set('number', '4');

var_dump($url-&gt;__toString());

// string(68) "https://www.example.com/listing?page%5Bnumber%5D=4&amp;page%5Bsize%5D=25"</code></pre>
<h2>Standalone Usage</h2>
<p>If you want to parse query strings standalone, not in the URL context, you can create an instance of the <code>Query</code> class from string or from array:</p>
<pre><code class="language-php">$query = Query::fromString('foo=bar&amp;baz=quz');

$query = Query::fromArray(['foo' =&gt; 'bar', 'baz' =&gt; 'quz']);</code></pre>
<h3>Access</h3>
<p>Here a quick example of different ways how to access query string params:</p>
<pre><code class="language-php">$fooValue = Query::fromString('foo=bar&amp;baz=quz')-&gt;get('foo'); // string(3) "bar"</code></pre>
<p>When the requested key is an array, the <code>get()</code> method returns another (child) <code>Query</code> instance that you can query further:</p>
<pre><code class="language-php">$fooBazValue = Query::fromString('foo[bar]=1&amp;foo[baz]=2&amp;foo[quz]=3')
    -&gt;get('foo')
    -&gt;get('baz'); // string(1) "2"</code></pre>
<p>You can check if a certain key exists in the query:</p>
<pre><code class="language-php">$query = Query::fromString('foo=1&amp;bar=2');

$query-&gt;has('bar'); // bool(true)

$query-&gt;has('baz'); // bool(false)</code></pre>
<p>You can get the first or last element of an indexed array:</p>
<pre><code class="language-php">$query = Query::fromString('foo[]=1&amp;foo[]=2&amp;foo[]=3');

$query-&gt;first('foo'); // string(1) "1"

$query-&gt;last('foo');  // string(1) "3"</code></pre>
<p>You can check if the value for a certain key is an array of a scalar value:</p>
<pre><code class="language-php">$query = Query::fromString('foo[]=1&amp;foo[]=2&amp;bar=3');

$query-&gt;isArray('foo'); // bool(true)

$query-&gt;isScalar('foo'); // bool(false)

$query-&gt;isArray('bar'); // bool(false)

$query-&gt;isScalar('bar'); // bool(true)</code></pre>
<p>And of course you can then convert the query to a string or to an array again:</p>
<pre><code class="language-php">$query = Query::fromString('foo=bar&amp;baz=quz');

$queryArray = $query-&gt;toArray();

// array(2) {
//   ["foo"]=&gt;
//   string(3) "bar"
//   ["baz"]=&gt;
//   string(3) "quz"
// }

$query = Query::fromArray(['foo' =&gt; 'bar', 'baz' =&gt; 'quz']);

$queryString = $query-&gt;toString(); // string(15) "foo=bar&amp;baz=quz"</code></pre>
<h3>Manipulation</h3>
<p>You can <strong>set</strong> a certain key:</p>
<pre><code class="language-php">$query = Query::fromString('foo=bar')-&gt;set('baz', 'quz');

// string(15) "foo=bar&amp;baz=quz"</code></pre>
<p>Also to an array:</p>
<pre><code class="language-php">$query = Query::fromString('foo=1&amp;bar=2')
    -&gt;set('baz', ['3', '4']);

// string(29) "foo=1&amp;bar=2&amp;baz[0]=3&amp;baz[1]=4"</code></pre>
<p>You can also <strong>append</strong> values <strong>to</strong> an existing array:</p>
<pre><code class="language-php">$query = Query::fromString('foo[]=1&amp;foo[]=2')
    -&gt;appendTo('foo', '3');

// string(26) "foo[0]=1&amp;foo[1]=2&amp;foo[2]=3"

$query = Query::fromString('foo[bar]=1&amp;foo[baz]=2')
    -&gt;appendTo('foo', ['quz' =&gt; '3']);

// string(32) "foo[bar]=1&amp;foo[baz]=2&amp;foo[quz]=3"</code></pre>
<p><strong>Remove</strong> keys or values from keys:</p>
<pre><code class="language-php">$query = Query::fromString('foo[]=1&amp;foo[]=2&amp;bar=3&amp;baz=4')
    -&gt;remove('foo');

// string(11) "bar=3&amp;baz=4"

$query = Query::fromString('foo[]=1&amp;foo[]=2&amp;foo[]=3&amp;foo[]=2')
    -&gt;removeValueFrom('foo', '2');

// string(17) "foo[0]=1&amp;foo[1]=3"</code></pre>
<p>And you can <strong>filter</strong> or <strong>map</strong> queries with callback functions:</p>
<pre><code class="language-php">$query = Query::fromString('no1=12&amp;no2=7&amp;no3=23&amp;no4=9&amp;no5=10')
    -&gt;filter(function ($value, $key) {
        return (int) $value &gt;= 10;
    });

// string(20) "no1=12&amp;no3=23&amp;no5=10"

$query = Query::fromString('foo=1&amp;bar=2&amp;baz=3&amp;quz=4')
    -&gt;map(function ($value) {
        return (int) $value + 3;
    });

// string(23) "foo=4&amp;bar=5&amp;baz=6&amp;quz=7"</code></pre>
<p>For more details have a look at the <a href="/packages/query-string/v1.0/getting-started">documentation</a>. If you're having any question or issues, don't be shy and reach out on twitter or github.</p></body></html>
</article>
<script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"BlogPosting","headline":"Dealing with HTTP (Url) Query Strings in PHP","author":{"@type":"Person","name":"Christian Olear","alternateName":"Otsch"},"description":"There is a new package in town called query-string. It allows to create, access and manipulate query strings for HTTP requests in a very convenient way. Here's a quick overview of what you can do with it and also how it can be used via the url package.","dateCreated":"2022-06-02","datePublished":"2022-06-02","keywords":"crwlr, url, query-string, query, string, querystring, PHP, HTTP, requests, GET, POST"}</script>
</div>
</main>
<footer class="bg-slate-700 border-t-8 border-t-creen-400">
    <div class="w-full max-w-7xl mx-auto px-5 py-8 text-white">

        <p class="mb-3 text-center">
            <a class="text-white hover:text-creen-100 cursor-pointer font-semibold mr-10"
   href="https://twitter.com/crwlrsoft" target="_blank" rel="noopener"><svg class="inline-block align-middle h-10 w-10 mr-3"><use href="#twitter" /></svg> <span class="align-middle">Twitter</span></a>            <a class="text-white hover:text-creen-100 cursor-pointer font-semibold"
   href="https://github.com/crwlrsoft" target="_blank" rel="noopener"><svg class="inline-block align-middle h-6 w-6 mr-3"><use href="#github" /></svg> <span class="align-middle">GitHub</span></a>        </p>

        <p class="text-center">
            <a class="text-white hover:text-creen-100 cursor-pointer font-semibold mr-3"
   href="/privacy">Privacy</a> |
            <a class="text-white hover:text-creen-100 cursor-pointer font-semibold ml-3"
   href="/imprint">Imprint</a>        </p>
    </div>
</footer>
<script src="/js/highlight.min.js?id=e46338bb5182ab5b40675e85a5cdcc41"></script>
<script>hljs.highlightAll();</script>
</body>
</html>
";}

================================================
FILE: tests/CrawlerTest.php
================================================
<?php

namespace tests;

use Crwlr\Crawler\Steps\Exceptions\PreRunValidationException;
use Crwlr\Crawler\Steps\StepOutputType;
use tests\_Stubs\Crawlers\DummyOne;
use tests\_Stubs\Crawlers\DummyTwo;
use Crwlr\Crawler\Crawler;
use Crwlr\Crawler\Output;
use Crwlr\Crawler\Result;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Steps\StepInterface;
use Crwlr\Crawler\Stores\Store;
use Crwlr\Crawler\Stores\StoreInterface;
use Generator;
use Mockery;
use PHPUnit\Framework\TestCase;

/**
 * Builds a new instance of the DummyOne stub crawler for use in the tests of this file.
 */
function helper_getDummyCrawler(): Crawler
{
    $dummyCrawler = new DummyOne();

    return $dummyCrawler;
}

/**
 * Creates a dummy crawler that already has the step from helper_getInputReturningStep() added.
 */
function helper_getDummyCrawlerWithInputReturningStep(): Crawler
{
    $crawler = helper_getDummyCrawler();

    $crawler->addStep(helper_getInputReturningStep());

    return $crawler;
}

/** @var TestCase $this */

test(
    'The methods to define UserAgent, Logger and Loader instances are called in construct and the getter methods ' .
    'always return the same instance.',
    function () {
        $crawler = new DummyTwo();

        // Right after construction: the test properties read 'foo' and every call counter is 1.
        expect($crawler->getUserAgent()->testProperty)->toBe('foo');
        expect($crawler->getLogger()->testProperty)->toBe('foo');
        expect($crawler->getLoader()->testProperty)->toBe('foo');
        expect($crawler->userAgentCalled)->toBe(1);
        expect($crawler->loggerCalled)->toBe(1);
        expect($crawler->loaderCalled)->toBe(1);

        // Mutate the objects handed out by the getters.
        $crawler->getUserAgent()->testProperty = 'bar';
        $crawler->getLogger()->testProperty = 'bar';
        $crawler->getLoader()->testProperty = 'bar';

        // Adding steps passes on logger and loader, it should use the same instances.
        $crawler->addStep(Http::get());

        // The mutation is still visible and the counters did not move.
        expect($crawler->getUserAgent()->testProperty)->toBe('bar');
        expect($crawler->getLogger()->testProperty)->toBe('bar');
        expect($crawler->getLoader()->testProperty)->toBe('bar');
        expect($crawler->userAgentCalled)->toBe(1);
        expect($crawler->loggerCalled)->toBe(1);
        expect($crawler->loaderCalled)->toBe(1);
    },
);

it('gives you the current memory limit', function () {
    $limit = Crawler::getMemoryLimit();

    expect($limit)->toBeString();
});

it('changes the current memory limit when allowed', function () {
    $currentLimit = Crawler::getMemoryLimit();

    // Pick a target value that is guaranteed to differ from the current limit.
    $newValue = $currentLimit === '512M' ? '1G' : '512M';

    try {
        $setLimitReturnValue = Crawler::setMemoryLimit($newValue);

        // A false return value means the limit could not be changed, so it has to be untouched.
        $expectedLimit = $setLimitReturnValue === false ? $currentLimit : $newValue;

        expect(Crawler::getMemoryLimit())->toBe($expectedLimit);
    } finally {
        // The limit is set through static calls (presumably the process-wide memory_limit ini setting),
        // so restore the previous value to not leak the changed limit into the tests running afterwards.
        Crawler::setMemoryLimit($currentLimit);
    }
});

test('You can set a single input for the first step using the input method', function () {
    $url = 'https://www.example.com';

    $crawler = helper_getDummyCrawlerWithInputReturningStep();

    $crawler->input($url);

    $results = helper_generatorToArray($crawler->run());

    // The only step has no name, so its output is found under the 'unnamed' key.
    expect($results[0]->toArray()['unnamed'])->toBe($url);
});

test('You can set multiple inputs by multiply calling the input method', function () {
    $urls = ['https://www.crwl.io', 'https://www.otsch.codes'];

    $crawler = helper_getDummyCrawlerWithInputReturningStep();

    // One input() call per URL.
    foreach ($urls as $url) {
        $crawler->input($url);
    }

    $results = helper_generatorToArray($crawler->run());

    // Results come back in the order the inputs were added.
    foreach ($urls as $index => $url) {
        expect($results[$index]->toArray()['unnamed'])->toBe($url);
    }
});

test('You can set multiple inputs using the inputs (plural) method', function () {
    $urls = ['https://www.crwl.io', 'https://www.otsch.codes'];

    $crawler = helper_getDummyCrawlerWithInputReturningStep();

    // All inputs are handed over in one call.
    $crawler->inputs($urls);

    $results = helper_generatorToArray($crawler->run());

    foreach ($urls as $index => $url) {
        expect($results[$index]->toArray()['unnamed'])->toBe($url);
    }
});

test('Initial inputs are reset after the crawler was run', function () {
    $crawler = helper_getDummyCrawlerWithInputReturningStep();

    // First run: two inputs, two results.
    $crawler->inputs(['https://www.crwl.io', 'https://www.otsch.codes']);

    $firstRunResults = helper_generatorToArray($crawler->run());

    expect($firstRunResults)->toHaveCount(2);

    // Second run: only the newly added input must be processed.
    $crawler->input('https://fetzi.dev/');

    $secondRunResults = helper_generatorToArray($crawler->run());

    expect($secondRunResults)->toHaveCount(1);
});

test('You can add steps and the Crawler class passes on its Logger and also its Loader if needed', function () {
    $crawler = helper_getDummyCrawler();

    // A plain step mock: only addLogger() is expected, exactly once.
    $plainStep = Mockery::mock(StepInterface::class);

    $plainStep->shouldReceive('addLogger')->once();

    $crawler->addStep($plainStep);

    // A partial mock of the loading step: logger, loader and parent crawler are each expected once.
    $loadingStep = Mockery::mock(helper_getLoadingStep())->makePartial();

    $loadingStep->shouldReceive('addLogger')->once();

    $loadingStep->shouldReceive('setLoader')->once();

    $loadingStep->shouldReceive('setParentCrawler')->once()->andReturnSelf();

    /** @var Step $loadingStep */

    $crawler->addStep($loadingStep);
});

test('You can add steps and they are invoked when the Crawler is run', function () {
    $crawler = helper_getDummyCrawler();

    // Both steps keep their output under their own key.
    $crawler->addStep(helper_getValueReturningStep('step1 output')->keepAs('step1'));

    $crawler->addStep(helper_getValueReturningStep('step2 output')->keepAs('step2'));

    $crawler->input('randomInput');

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    expect($results[0]->toArray())->toBe(['step1' => 'step1 output', 'step2' => 'step2 output']);
});

it('resets the initial inputs and calls the resetAfterRun method of all its steps', function () {
    $step = helper_getInputReturningStep()->uniqueOutputs()->keepAs('foo');

    $crawler = helper_getDummyCrawler();

    $crawler->inputs(['input1', 'input1', 'input2']);

    $crawler->addStep($step);

    // First run: the duplicate 'input1' is dropped.
    $firstRun = helper_generatorToArray($crawler->run());

    expect($firstRun)->toHaveCount(2);

    expect($firstRun[0]->toArray())->toBe(['foo' => 'input1']);

    expect($firstRun[1]->toArray())->toBe(['foo' => 'input2']);

    // Second run: 'input1' is yielded again, so the uniqueness tracking did not survive the first run.
    $crawler->inputs(['input1', 'input3']);

    $secondRun = helper_generatorToArray($crawler->run());

    expect($secondRun)->toHaveCount(2);

    expect($secondRun[0]->toArray())->toBe(['foo' => 'input1']);

    expect($secondRun[1]->toArray())->toBe(['foo' => 'input3']);
});

test('You can add a step group as a step and all it\'s steps are invoked when the Crawler is run', function () {
    // Builds a step mock that yields the given value from invokeStep() and accepts addLogger().
    $makeStepMock = static function (string $outputValue) {
        $stepMock = Mockery::mock(StepInterface::class);

        $stepMock->shouldReceive('invokeStep')->andReturn(helper_arrayToGenerator([$outputValue]));

        $stepMock->shouldReceive('addLogger');

        return $stepMock;
    };

    $step1 = $makeStepMock('foo');

    $step2 = $makeStepMock('bar');

    $step3 = $makeStepMock('baz');

    $group = Crawler::group()
        ->addStep($step1)
        ->addStep($step2)
        ->addStep($step3);

    $crawler = helper_getDummyCrawler();

    // NOTE(review): the crawler is never run in this test, so the invokeStep() stubs are not exercised.
    // The test currently only proves that adding a group of steps does not fail.
    $crawler->addStep($group);

    expect(true)->toBeTrue(); // So pest doesn't complain that there is no assertion.
});

/* ----------------------------- keep() and keepAs() ----------------------------- */

test('when you call keep() or keepAs() on a step, it keeps its output data until the end', function () {
    // keep() without arguments: the whole array output is kept.
    $parentsStep = helper_getValueReturningStep(['father' => 'Karl', 'mother' => 'Ludmilla'])->keep();

    // keep() with a mapping: only the selected properties are kept, under new names.
    $childrenStep = helper_getValueReturningStep([
        'daughter1' => 'Elisabeth',
        'son1' => 'Leon',
        'son2' => 'Franz',
        'daughter2' => 'Julia',
        'daughter3' => 'Franziska',
    ])->keep(['daughter' => 'daughter2', 'son' => 'son2']);

    // keepAs(): the scalar output is kept under the given key.
    $cousinStep = helper_getValueReturningStep('Lea')->keepAs('cousin');

    // The last step has no keep call; its array output still ends up in the result.
    $grandchildrenStep = helper_getValueReturningStep([
        'grandson1' => 'Jonah',
        'granddaughter1' => 'Paula',
        'granddaughter2' => 'Sophie',
    ]);

    $crawler = helper_getDummyCrawler();

    $crawler->input('test');

    $crawler->addStep($parentsStep);

    $crawler->addStep($childrenStep);

    $crawler->addStep($cousinStep);

    $crawler->addStep($grandchildrenStep);

    $results = iterator_to_array($crawler->run());

    $expectedResult = [
        'father' => 'Karl',
        'mother' => 'Ludmilla',
        'daughter' => 'Julia',
        'son' => 'Franz',
        'cousin' => 'Lea',
        'grandson1' => 'Jonah',
        'granddaughter1' => 'Paula',
        'granddaughter2' => 'Sophie',
    ];

    expect($results[0]->toArray())->toBe($expectedResult);
});

it('immediately stops when keepAs() is not used with a scalar value output step', function () {
    // First step: array output, records whether it was ever invoked.
    $arrayOutputStep = new class extends Step {
        public bool $wasCalled = false;

        protected function invoke(mixed $input): Generator
        {
            $this->wasCalled = true;

            yield ['father' => 'Karl', 'mother' => 'Ludmilla'];
        }

        public function outputType(): StepOutputType
        {
            return StepOutputType::AssociativeArrayOrObject;
        }
    };

    // Second step: declares scalar output but is configured with keep() instead of keepAs().
    $scalarOutputStep = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'foo';
        }

        public function outputType(): StepOutputType
        {
            return StepOutputType::Scalar;
        }
    };

    $crawler = helper_getDummyCrawler();

    $crawler->input('test');

    $crawler->addStep($arrayOutputStep->keep());

    $crawler->addStep($scalarOutputStep->keep());

    $results = null;

    $exception = null;

    try {
        $results = iterator_to_array($crawler->run());
    } catch (PreRunValidationException $caughtException) {
        $exception = $caughtException;
    }

    // No results, the first step never ran, the error was logged and the exception was thrown.
    expect($results)->toBeEmpty();

    expect($arrayOutputStep->wasCalled)->toBeFalse();

    expect($this->getActualOutputForAssertion())->toContain('Pre-Run validation error in step number 2');

    expect($exception)->toBeInstanceOf(PreRunValidationException::class);
});

it('sends all results to the Store when there is one and still yields the results', function () {
    // The store mock has to receive one store() call per yielded result.
    $storeMock = Mockery::mock(StoreInterface::class);

    $storeMock->shouldReceive('addLogger');

    $storeMock->shouldReceive('store')->times(3);

    $numbersStep = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'one';
            yield 'two';
            yield 'three';
        }
    };

    $crawler = helper_getDummyCrawler();

    $crawler->input('gogogo');

    $crawler->setStore($storeMock);

    $crawler->addStep($numbersStep->keepAs('number'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(3);

    foreach (['one', 'two', 'three'] as $index => $number) {
        expect($results[$index]->toArray())->toBe(['number' => $number]);
    }
});

it(
    'actually runs the crawler without the need to traverse results manually, when runAndTraverse is called',
    function () {
        $step = helper_getInputReturningStep();

        // Part 1: run() is called but its return value is not traversed, so store() must not be called.
        $untouchedStore = Mockery::mock(StoreInterface::class);

        $untouchedStore->shouldReceive('addLogger');

        $untouchedStore->shouldNotReceive('store');

        $firstCrawler = helper_getDummyCrawler()
            ->addStep($step)
            ->setStore($untouchedStore)
            ->input('test');

        $firstCrawler->run();

        // Part 2: runAndTraverse() has to reach the store without any manual iteration.
        $calledStore = Mockery::mock(StoreInterface::class);

        $calledStore->shouldReceive('store', 'addLogger')->once();

        $secondCrawler = helper_getDummyCrawler()
            ->addStep($step)
            ->setStore($calledStore)
            ->input('test');

        $secondCrawler->runAndTraverse();
    },
);

it('yields only unique outputs from a step when uniqueOutput was called', function () {
    $uniqueStep = helper_getInputReturningStep()->uniqueOutputs();

    $crawler = helper_getDummyCrawler();

    $crawler->addStep($uniqueStep);

    // Nine inputs, but only five distinct values: one, two, three, four, five.
    $crawler->inputs(['one', 'two', 'three', 'one', 'three', 'four', 'one', 'five', 'two']);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(5);
});

it(
    'cascades step outputs immediately and doesn\'t wait for the current step being called with all the inputs',
    function () {
        // Yields two outputs per input and logs every invocation.
        $firstStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                $this->logger?->info('step1 called');

                yield $input . ' step1-1';

                yield $input . ' step1-2';
            }
        };

        // Yields one output per input and logs every invocation.
        $secondStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                $this->logger?->info('step2 called');

                yield $input . ' step2';
            }
        };

        // Only logs that it was called, so the call order shows up in the captured output.
        $loggingStore = new class extends Store {
            public function store(Result $result): void
            {
                $this->logger?->info('Stored a result');
            }
        };

        $crawler = helper_getDummyCrawler();

        $crawler->inputs(['input1', 'input2']);

        $crawler->addStep($firstStep->keepAs('foo'));

        $crawler->addStep($secondStep->keepAs('bar'));

        $crawler->setStore($loggingStore);

        $crawler->runAndTraverse();

        $outputLines = explode("\n", $this->getActualOutputForAssertion());

        // Expected log order: each step1 output is passed on and stored before step1 handles the next input.
        $expectedLines = [
            'step1 called',
            'step2 called',
            'Stored a result',
            'step2 called',
            'Stored a result',
            'step1 called',
            'step2 called',
            'Stored a result',
            'step2 called',
            'Stored a result',
        ];

        foreach ($expectedLines as $lineNumber => $expectedText) {
            expect($outputLines[$lineNumber])->toContain($expectedText);
        }
    },
);

it(
    'immediately calls the store for each final output',
    function () {
        // Each of the four steps logs its invocation and yields two outputs per input.
        $firstStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                $this->logger?->info('step1 called');

                yield '1-1';

                yield '1-2';
            }
        };

        $secondStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                $this->logger?->info('step2 called: ' . $input);

                yield $input . ' 2-1';

                yield $input . ' 2-2';
            }
        };

        $thirdStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                $this->logger?->info('step3 called: ' . $input);

                yield $input . ' 3-1';

                yield $input . ' 3-2';
            }
        };

        $fourthStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                $this->logger?->info('step4 called: ' . $input);

                yield $input . ' 4-1';

                yield $input . ' 4-2';
            }
        };

        // Logs the stored value, so the position of every store call is visible in the output.
        $loggingStore = new class extends Store {
            public function store(Result $result): void
            {
                $this->logger?->info('Stored a result: ' . $result->get('unnamed'));
            }
        };

        $crawler = helper_getDummyCrawler();

        $crawler->input('input');

        $crawler->addStep($firstStep);

        $crawler->addStep($secondStep);

        $crawler->addStep($thirdStep);

        $crawler->addStep($fourthStep);

        $crawler->setStore($loggingStore);

        $crawler->runAndTraverse();

        $outputLines = explode("\n", $this->getActualOutputForAssertion());

        // Expected log order: every final output is stored right after step4 yielded it.
        $expectedLines = [
            'step1 called',
            'step2 called: 1-1',
            'step3 called: 1-1 2-1',
            'step4 called: 1-1 2-1 3-1',
            'Stored a result: 1-1 2-1 3-1 4-1',
            'Stored a result: 1-1 2-1 3-1 4-2',
            'step4 called: 1-1 2-1 3-2',
            'Stored a result: 1-1 2-1 3-2 4-1',
            'Stored a result: 1-1 2-1 3-2 4-2',
            'step3 called: 1-1 2-2',
            'step4 called: 1-1 2-2 3-1',
            'Stored a result: 1-1 2-2 3-1 4-1',
            'Stored a result: 1-1 2-2 3-1 4-2',
            'step4 called: 1-1 2-2 3-2',
            'Stored a result: 1-1 2-2 3-2 4-1',
            'Stored a result: 1-1 2-2 3-2 4-2',
            'step2 called: 1-2',
            'step3 called: 1-2 2-1',
            'step4 called: 1-2 2-1 3-1',
            'Stored a result: 1-2 2-1 3-1 4-1',
            'Stored a result: 1-2 2-1 3-1 4-2',
            'step4 called: 1-2 2-1 3-2',
            'Stored a result: 1-2 2-1 3-2 4-1',
            'Stored a result: 1-2 2-1 3-2 4-2',
            'step3 called: 1-2 2-2',
            'step4 called: 1-2 2-2 3-1',
            'Stored a result: 1-2 2-2 3-1 4-1',
            'Stored a result: 1-2 2-2 3-1 4-2',
            'step4 called: 1-2 2-2 3-2',
            'Stored a result: 1-2 2-2 3-2 4-1',
            'Stored a result: 1-2 2-2 3-2 4-2',
        ];

        foreach ($expectedLines as $lineNumber => $expectedText) {
            expect($outputLines[$lineNumber])->toContain($expectedText);
        }
    },
);

it(
    'does not wait for all child outputs originating from an output of a step where keepAs() was called before ' .
    'calling the store',
    function () {
        // Same four-step cascade as in the test above, but step2 and step4 keep their output.
        $firstStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                $this->logger?->info('step1 called');

                yield '1-1';

                yield '1-2';
            }
        };

        $secondStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                $this->logger?->info('step2 called: ' . $input);

                yield $input . ' 2-1';

                yield $input . ' 2-2';
            }
        };

        $secondStep->keepAs('foo');

        $thirdStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                $this->logger?->info('step3 called: ' . $input);

                yield $input . ' 3-1';

                yield $input . ' 3-2';
            }
        };

        $fourthStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                $this->logger?->info('step4 called: ' . $input);

                yield $input . ' 4-1';

                yield $input . ' 4-2';
            }
        };

        $fourthStep->keepAs('bar');

        // Logs the value kept by step4, so the position of every store call is visible in the output.
        $loggingStore = new class extends Store {
            public function store(Result $result): void
            {
                $this->logger?->info('Stored a result: ' . $result->get('bar'));
            }
        };

        $crawler = helper_getDummyCrawler();

        $crawler->input('input');

        $crawler->addStep($firstStep);

        $crawler->addStep($secondStep);

        $crawler->addStep($thirdStep);

        $crawler->addStep($fourthStep);

        $crawler->setStore($loggingStore);

        $crawler->runAndTraverse();

        $outputLines = explode("\n", $this->getActualOutputForAssertion());

        // Expected log order: results are stored as soon as step4 yields them, not grouped per step2 output.
        $expectedLines = [
            'step1 called',
            'step2 called: 1-1',
            'step3 called: 1-1 2-1',
            'step4 called: 1-1 2-1 3-1',
            'Stored a result: 1-1 2-1 3-1 4-1',
            'Stored a result: 1-1 2-1 3-1 4-2',
            'step4 called: 1-1 2-1 3-2',
            'Stored a result: 1-1 2-1 3-2 4-1',
            'Stored a result: 1-1 2-1 3-2 4-2',
            'step3 called: 1-1 2-2',
            'step4 called: 1-1 2-2 3-1',
            'Stored a result: 1-1 2-2 3-1 4-1',
            'Stored a result: 1-1 2-2 3-1 4-2',
            'step4 called: 1-1 2-2 3-2',
            'Stored a result: 1-1 2-2 3-2 4-1',
            'Stored a result: 1-1 2-2 3-2 4-2',
            'step2 called: 1-2',
            'step3 called: 1-2 2-1',
            'step4 called: 1-2 2-1 3-1',
            'Stored a result: 1-2 2-1 3-1 4-1',
            'Stored a result: 1-2 2-1 3-1 4-2',
            'step4 called: 1-2 2-1 3-2',
            'Stored a result: 1-2 2-1 3-2 4-1',
            'Stored a result: 1-2 2-1 3-2 4-2',
            'step3 called: 1-2 2-2',
            'step4 called: 1-2 2-2 3-1',
            'Stored a result: 1-2 2-2 3-1 4-1',
            'Stored a result: 1-2 2-2 3-1 4-2',
            'step4 called: 1-2 2-2 3-2',
            'Stored a result: 1-2 2-2 3-2 4-1',
            'Stored a result: 1-2 2-2 3-2 4-2',
        ];

        foreach ($expectedLines as $lineNumber => $expectedText) {
            expect($outputLines[$lineNumber])->toContain($expectedText);
        }
    },
);

it('logs memory usage if you want it to', function () {
    $crawler = helper_getDummyCrawler();

    $crawler->input('go');

    $crawler->addStep(helper_getValueReturningStep('foo'));

    $crawler->addStep(helper_getValueReturningStep('bar'));

    // Memory monitoring is switched on explicitly.
    $crawler->monitorMemoryUsage();

    $crawler->runAndTraverse();

    expect($this->getActualOutputForAssertion())->toContain('memory usage: ');
});

it('sends all outputs to the outputHook when defined', function () {
    // Collected output values, grouped by the index of the step that produced them.
    $valuesByStep = [];

    $collectOutput = function (Output $output, int $stepIndex, StepInterface $step) use (&$valuesByStep) {
        $valuesByStep[$stepIndex][] = $output->get();
    };

    $crawler = helper_getDummyCrawler();

    $crawler->input(1);

    $crawler->addStep(helper_getNumberIncrementingStep());

    $crawler->addStep(helper_getNumberIncrementingStep());

    $crawler->outputHook($collectOutput);

    $crawler->runAndTraverse();

    // Input 1 is incremented once per step: 2 after the first step, 3 after the second.
    expect($valuesByStep)->toHaveCount(2);

    expect($valuesByStep[0])->toHaveCount(1);

    expect($valuesByStep[0][0])->toBe(2);

    expect($valuesByStep[1])->toHaveCount(1);

    expect($valuesByStep[1][0])->toBe(3);
});

test(
    'When result is not explicitly composed and last step produces array output with string keys, it uses those keys ' .
    'for the result.',
    function () {
        $stepOutput = ['foo' => 'bar', 'baz' => 'quz'];

        $crawler = helper_getDummyCrawler();

        $crawler->input('hello');

        // No keep()/keepAs() call: the array output itself becomes the result.
        $crawler->addStep(helper_getValueReturningStep($stepOutput));

        $results = helper_generatorToArray($crawler->run());

        expect($results[0]->toArray())->toBe($stepOutput);
    },
);

it('just runs the crawler and dumps all results as array when runAndDump() is called', function () {
    $inputs = [
        ['foo' => 'one', 'bar' => 'two'],
        ['baz' => 'three', 'quz' => 'four'],
    ];

    $crawler = helper_getDummyCrawlerWithInputReturningStep();

    $crawler->inputs($inputs);

    $crawler->runAndDump();

    $actualOutput = $this->getActualOutputForAssertion();

    // Splitting at 'array(2)' gives three parts when exactly two arrays were dumped.
    expect(explode('array(2)', $actualOutput))->toHaveCount(3);

    $expectedFragments = [
        '["foo"]=>',
        'string(3) "one"',
        '["bar"]=>',
        'string(3) "two"',
        '["baz"]=>',
        'string(5) "three"',
        '["quz"]=>',
        'string(4) "four"',
    ];

    foreach ($expectedFragments as $fragment) {
        expect($actualOutput)->toContain($fragment);
    }
});


================================================
FILE: tests/HttpCrawler/AnonymousHttpCrawlerBuilderTest.php
================================================
<?php

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgent;

it('builds an HttpCrawler instance with a bot user agent', function () {
    $productToken = 'YoloCrawler';

    $crawler = HttpCrawler::make()->withBotUserAgent($productToken);

    expect($crawler)->toBeInstanceOf(HttpCrawler::class);

    $loader = $crawler->getLoader();

    expect($loader)->toBeInstanceOf(HttpLoader::class);

    /** @var BotUserAgent $userAgent */
    $userAgent = $loader->userAgent();

    // The loader carries a bot user agent with the product token given to the builder.
    expect($userAgent)->toBeInstanceOf(BotUserAgent::class);

    expect($userAgent->productToken())->toBe($productToken);
});

it('creates an HttpCrawler instance with a non bot user agent', function () {
    $userAgentString = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ...';

    $crawler = HttpCrawler::make()->withUserAgent($userAgentString);

    expect($crawler)->toBeInstanceOf(HttpCrawler::class);

    $loader = $crawler->getLoader();

    expect($loader)->toBeInstanceOf(HttpLoader::class);

    /** @var UserAgent $userAgent */
    $userAgent = $loader->userAgent();

    // The user agent string given to the builder comes back unchanged.
    expect($userAgent)->toBeInstanceOf(UserAgent::class);

    expect($userAgent->__toString())->toBe($userAgentString);
});

it('creates an HttpCrawler instance with a mozilla 5.0 compatible user agent', function () {
    $loader = HttpCrawler::make()->withMozilla5CompatibleUserAgent()->getLoader();

    $userAgentString = $loader->userAgent()->__toString();

    expect($userAgentString)->toBe('Mozilla/5.0 (compatible)');
});


================================================
FILE: tests/IoTest.php
================================================
<?php

namespace tests;

use Crwlr\Crawler\Io;

/**
 * Creates an instance of an anonymous class extending Io, constructed with the given value and keep data.
 *
 * @param mixed[] $keep
 */
function helper_getIoInstance(
    mixed $value,
    array $keep = [],
): Io {
    $io = new class ($value, $keep) extends Io {};

    return $io;
}

it('can be created with only a value.', function () {
    // The keep argument is left at its default.
    expect(helper_getIoInstance('test'))->toBeInstanceOf(Io::class);
});

test('you can add an array with data that should be kept (see Step::keep() functionality)', function () {
    $dataToKeep = ['foo' => 'bar', 'baz' => 'quz'];

    $io = helper_getIoInstance('test', $dataToKeep);

    // The keep data is exposed unchanged via the keep property.
    expect($io->keep)->toBe($dataToKeep);
});

test('you can create it from another Io instance and it keeps the value of the original instance.', function () {
    $original = helper_getIoInstance('test');

    // The original instance itself is passed as the value.
    $wrapper = helper_getIoInstance($original);

    expect($wrapper->get())->toBe('test');
});

test('when created from another Io instance it passes on the data to keep', function () {
    $dataToKeep = ['co' => 'derotsch'];

    $original = helper_getIoInstance('test', $dataToKeep);

    // No keep data is passed here, it has to come from the original instance.
    $wrapper = helper_getIoInstance($original);

    expect($wrapper->keep)->toBe($dataToKeep);
});

test('the withValue() method creates a new instance with that value but keeps the keep data', function () {
    $original = helper_getIoInstance('hey', ['baz' => 'three']);

    $changed = $original->withValue('ho');

    // New value, same keep data.
    expect($changed->get())->toBe('ho');

    expect($changed->keep)->toBe(['baz' => 'three']);
});

test(
    'the withPropertyValue() method creates a new instance and replaces a certain property in its array value',
    function () {
        $dataToKeep = ['baz' => 'three'];

        $original = helper_getIoInstance(['a' => '1', 'b' => '2', 'c' => '3'], $dataToKeep);

        $changed = $original->withPropertyValue('c', '4');

        // Only property 'c' differs, the keep data is carried over.
        expect($changed->get())->toBe(['a' => '1', 'b' => '2', 'c' => '4']);

        expect($changed->keep)->toBe($dataToKeep);
    },
);

test('if the property does not exist, it is added, when withPropertyValue() is used', function () {
    $original = helper_getIoInstance(['a' => '1', 'b' => '2']);

    // 'c' is not part of the original array value.
    $extended = $original->withPropertyValue('c', '3');

    expect($extended->get())->toBe(['a' => '1', 'b' => '2', 'c' => '3']);
});

it('gets a particular property by key from array output', function () {
    $arrayValue = ['foo' => 'so', 'bar' => 'lala', 'baz' => 'bla'];

    $property = helper_getIoInstance($arrayValue)->getProperty('bar');

    expect($property)->toBe('lala');
});

it('when the property does not exist, getProperty() returns the defined fallback value (default null)', function () {
    $io = helper_getIoInstance(['foo' => 'so', 'bar' => 'lala', 'baz' => 'bla']);

    // Without an explicit fallback the result is null.
    expect($io->getProperty('quz'))->toBeNull();

    // With an explicit fallback, that value is returned.
    expect($io->getProperty('quz', 123))->toBe(123);
});

it('sets a simple value key', function ($value, $expectedKey) {
    $io = helper_getIoInstance($value);

    // setKey() returns the key and getKey() reports the same one afterwards.
    expect($io->setKey())->toBe($expectedKey);

    expect($io->getKey())->toBe($expectedKey);
})->with([
    ['foo', 'foo'],
    [123, '123'],
    [123.1234, '123.1234'],
    [true, 'true'],
    [false, 'false'],
    [null, 'null'],
]);

it('sets a key from array output', function () {
    $io = helper_getIoInstance(['foo' => 'bar', 'yo' => 123.45]);

    // The float property value is used as key in its string form.
    $returnedKey = $io->setKey('yo');

    expect($returnedKey)->toBe('123.45');

    expect($io->getKey())->toBe('123.45');
});

it('sets a key from object output', function () {
    $object = helper_getStdClassWithData(['foo' => 'bar', 'yo' => 123.45]);

    $io = helper_getIoInstance($object);

    // The float property value is used as key in its string form.
    $returnedKey = $io->setKey('yo');

    expect($returnedKey)->toBe('123.45');

    expect($io->getKey())->toBe('123.45');
});

it('creates a string key for array output when not providing a key name', function () {
    $expectedKey = '6975f1fd65cae4b21e32f4f47bf153a8';

    $io = helper_getIoInstance(['one', 'two', 'three']);

    // Without a key name, a string key is generated for the whole array.
    expect($io->setKey())->toBe($expectedKey);

    expect($io->getKey())->toBe($expectedKey);
});

it('creates a string key for object output when not providing a key name', function () {
    $expectedKey = 'bb8dd69ea029ca1379df3994721f5fa9';

    $io = helper_getIoInstance(helper_getStdClassWithData(['one', 'two', 'three']));

    // Without a key name, a string key is generated for the whole object.
    expect($io->setKey())->toBe($expectedKey);

    expect($io->getKey())->toBe($expectedKey);
});

it('creates a string key for array output when provided key name doesn\'t exist in output array', function () {
    $expectedKey = '6975f1fd65cae4b21e32f4f47bf153a8';

    $io = helper_getIoInstance(['one', 'two', 'three']);

    // 'four' is not a key of the array, so the key is generated as if no key name was given.
    expect($io->setKey('four'))->toBe($expectedKey);

    expect($io->getKey())->toBe($expectedKey);
});

it('creates a string key for object output when provided key name doesn\'t exist in output object', function () {
    // Same helper spelling as everywhere else in this file (PHP function names are case-insensitive,
    // so the previous lowercase spelling worked, but was inconsistent).
    $object = helper_getStdClassWithData(['one', 'two', 'three']);

    $io = helper_getIoInstance($object);

    // 'four' is not a property of the object, so the key matches the one generated without a key name.
    expect($io->setKey('four'))->toBe('bb8dd69ea029ca1379df3994721f5fa9')
        ->and($io->getKey())->toBe('bb8dd69ea029ca1379df3994721f5fa9');
});

test('getKey returns a key when setKey was not called yet', function () {
    // setKey() is deliberately not called before getKey().
    $key = helper_getIoInstance('test')->getKey();

    expect($key)->toBe('test');
});

test('isArrayWithStringKeys returns true when the value is an array with string keys', function () {
    $associativeArray = ['foo' => 'one', 'bar' => 'two', 'baz' => 'three'];

    $hasStringKeys = helper_getIoInstance($associativeArray)->isArrayWithStringKeys();

    expect($hasStringKeys)->toBeTrue();
});

test('isArrayWithStringKeys returns false when the value is not an array with string keys', function ($value) {
    // Dataset: an int, a bool, a list array and an object.
    $hasStringKeys = helper_getIoInstance($value)->isArrayWithStringKeys();

    expect($hasStringKeys)->toBeFalse();
})->with([
    123,
    true,
    ['foo', 'bar'],
    helper_getStdClassWithData(['foo' => 'bar']),
]);

it('adds data to keep when calling keep() and makes already existing keys an array', function () {
    $io = helper_getIoInstance('value', ['foo' => 'one', 'bar' => 'two']);

    // 'bar' already exists, 'baz' is new.
    $io->keep(['bar' => 'three', 'baz' => 'four']);

    $expectedKeepData = ['foo' => 'one', 'bar' => ['two', 'three'], 'baz' => 'four'];

    expect($io->keep)->toBe($expectedKeepData);
});


================================================
FILE: tests/Loader/Http/Browser/ScreenshotConfigTest.php
================================================
<?php

namespace tests\Loader\Http\Browser;

use Crwlr\Crawler\Loader\Http\Browser\ScreenshotConfig;
use HeadlessChromium\Clip;
use HeadlessChromium\Page;
use Mockery;

// Both ways of creating a config must yield the same defaults: png, no quality, no full page.
it('can be constructed with a store path only', function () {
    $config = new ScreenshotConfig('/some/path');

    expect($config->storePath)->toBe('/some/path');

    expect($config->fileType)->toBe('png');

    expect($config->quality)->toBeNull();

    expect($config->fullPage)->toBeFalse();
});

it('can be constructed via the static make() method', function () {
    $config = ScreenshotConfig::make('/some/different/path');

    expect($config->storePath)->toBe('/some/different/path');

    expect($config->fileType)->toBe('png');

    expect($config->quality)->toBeNull();

    expect($config->fullPage)->toBeFalse();
});

// Switching to jpeg or webp is expected to come with a quality of 80.
test('the image file type can be changed to jpeg via the setImageFileType() method', function () {
    $config = ScreenshotConfig::make('/some/path')->setImageFileType('jpeg');

    expect($config->fileType)->toBe('jpeg');

    expect($config->quality)->toBe(80);
});

test('the image file type can be changed to webp via the setImageFileType() method', function () {
    $config = ScreenshotConfig::make('/some/path')->setImageFileType('webp');

    expect($config->fileType)->toBe('webp');

    expect($config->quality)->toBe(80);
});

// Going back to png is expected to reset the quality to null.
test('the image file type can be changed to png via the setImageFileType() method', function () {
    $config = ScreenshotConfig::make('/some/path')->setImageFileType('jpeg');

    $config->setImageFileType('png');

    expect($config->fileType)->toBe('png');

    expect($config->quality)->toBeNull();
});

// An unsupported type leaves the default (png) in place.
test('setting the image file type to something different than png, jpeg or webp does not work', function () {
    $config = ScreenshotConfig::make('/some/path')->setImageFileType('gif');

    $fileType = $config->fileType;

    expect($fileType)->toBe('png');
});

// The quality is only expected to be adjustable for file types other than png.
test('the image quality can be changed via setQuality()', function () {
    $config = ScreenshotConfig::make('/some/path')
        ->setImageFileType('jpeg')
        ->setQuality(65);

    $quality = $config->quality;

    expect($quality)->toBe(65);
});

test('the image quality can not be changed via setQuality() when the file type is png', function () {
    $config = ScreenshotConfig::make('/some/path')->setQuality(65);

    $quality = $config->quality;

    expect($quality)->toBeNull();
});

test('the full page param can be set to true via setFullPage()', function () {
    $config = ScreenshotConfig::make('/some/path')->setFullPage();

    $fullPage = $config->fullPage;

    expect($fullPage)->toBeTrue();
});

// With defaults, the resulting chrome-php config only contains the format.
it('creates a config array for the chrome-php library', function () {
    $config = ScreenshotConfig::make('/some/path');

    $page = Mockery::mock(Page::class);

    $chromePhpConfig = $config->toChromePhpScreenshotConfig($page);

    expect($chromePhpConfig)->toBe(['format' => 'png']);
});

test('the config array for the chrome-php library contains the image quality', function () {
    $config = ScreenshotConfig::make('/some/path')->setImageFileType('webp')->setQuality(75);

    $page = Mockery::mock(Page::class);

    $chromePhpConfig = $config->toChromePhpScreenshotConfig($page);

    expect($chromePhpConfig)->toBe(['format' => 'webp', 'quality' => 75]);
});

// For full page screenshots the clip is taken from the page, so the page mock has to provide one.
test('the config array has the necessary properties when fullPage is set to true', function () {
    $page = Mockery::mock(Page::class);

    $page->shouldReceive('getFullPageClip')->andReturn(Mockery::mock(Clip::class));

    $chromePhpConfig = ScreenshotConfig::make('/some/path')
        ->setFullPage()
        ->toChromePhpScreenshotConfig($page);

    expect($chromePhpConfig['format'])->toBe('png');

    expect($chromePhpConfig['captureBeyondViewport'])->toBeTrue();

    expect($chromePhpConfig['clip'])->toBeInstanceOf(Clip::class);
});


================================================
FILE: tests/Loader/Http/Cache/RetryManagerTest.php
================================================
<?php

namespace tests\Loader\Http\Cache;

use Crwlr\Crawler\Loader\Http\Cache\RetryManager;

// Without only()/except(), the status code alone decides: 400 and above shall be retried.
it('returns true for status codes >= 400 when nothing else was defined', function (int $code) {
    $manager = new RetryManager();

    expect($manager->shallBeRetried($code))->toBeTrue();
})->with([[403], [404], [500], [503]]);

it('returns false for status codes below 400 when nothing else was defined', function (int $code) {
    $manager = new RetryManager();

    expect($manager->shallBeRetried($code))->toBeFalse();
})->with([[100], [200], [302], [308]]);

// only() restricts retries to the given status code(s); each dataset row is [status code, expected result].
it(
    'returns true for only one error status code when only() was used with an int',
    function (int $code, bool $shouldRetry) {
        $manager = new RetryManager();

        $manager->only(404);

        $actual = $manager->shallBeRetried($code);

        expect($actual)->toBe($shouldRetry);
    },
)->with([
    [401, false],
    [403, false],
    [404, true],
    [405, false],
    [500, false],
    [503, false],
]);

it(
    'returns true for only a set of error status codes when only() was used with an array',
    function (int $code, bool $shouldRetry) {
        $manager = new RetryManager();

        $manager->only([404, 503]);

        $actual = $manager->shallBeRetried($code);

        expect($actual)->toBe($shouldRetry);
    },
)->with([
    [401, false],
    [403, false],
    [404, true],
    [405, false],
    [500, false],
    [503, true],
]);

// except() excludes the given status code(s) from retries; each dataset row is [status code, expected result].
it(
    'returns true for all error status codes except one, when except() was used with an int',
    function (int $code, bool $shouldRetry) {
        $manager = new RetryManager();

        $manager->except(404);

        $actual = $manager->shallBeRetried($code);

        expect($actual)->toBe($shouldRetry);
    },
)->with([
    [401, true],
    [403, true],
    [404, false],
    [405, true],
    [500, true],
    [503, true],
]);

it(
    'returns true except for a set of error status codes, when except() was used with an array',
    function (int $code, bool $shouldRetry) {
        $manager = new RetryManager();

        $manager->except([403, 410, 500]);

        $actual = $manager->shallBeRetried($code);

        expect($actual)->toBe($shouldRetry);
    },
)->with([
    [401, true],
    [403, false],
    [404, true],
    [405, true],
    [410, false],
    [500, false],
    [503, true],
]);


================================================
FILE: tests/Loader/Http/Cookies/CookieJarTest.php
================================================
<?php

namespace tests\Loader\Http\Cookies;

use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Url\Url;
use GuzzleHttp\Psr7\Response;
use HeadlessChromium\Cookies\Cookie;
use HeadlessChromium\Cookies\CookiesCollection;

// addFrom() must accept the source url as string, PSR-7 UriInterface and Url instance.
test('addFrom works with a string url', function () {
    $response = new Response(200, ['Set-Cookie' => ['cook13=v4lu3; Secure']]);

    $cookieJar = new CookieJar();

    $cookieJar->addFrom('https://www.crwl.io', $response);

    expect($cookieJar->allByDomain('crwl.io'))->toHaveCount(1);
});

test('addFrom works with an instance of UriInterface', function () {
    $response = new Response(200, ['Set-Cookie' => ['cook13=v4lu3; Secure']]);

    $cookieJar = new CookieJar();

    $cookieJar->addFrom(Url::parsePsr7('https://www.crwl.io'), $response);

    expect($cookieJar->allByDomain('crwl.io'))->toHaveCount(1);
});

test('addFrom works with an instance of Url', function () {
    $response = new Response(200, ['Set-Cookie' => ['cook13=v4lu3; Secure']]);

    $cookieJar = new CookieJar();

    $cookieJar->addFrom(Url::parse('https://www.crwl.io'), $response);

    expect($cookieJar->allByDomain('crwl.io'))->toHaveCount(1);
});

// Cookies coming from the headless browser (chrome-php) are converted to the jar's own cookie objects.
test('addFrom() works with a CookieCollection from the chrome-php lib', function () {
    $browserCookies = new CookiesCollection([
        new Cookie([
            'name' => 'foo',
            'value' => 'one',
            'domain' => '.www.crwl.io',
            'expires' => '1745068860',
            'max-age' => '86400',
            'secure' => true,
            'httpOnly' => true,
            'sameSite' => 'Strict',
        ]),
        new Cookie([
            'name' => 'bar',
            'value' => 'two',
            'domain' => '.www.crwl.io',
            'expires' => '1729603260.5272',
            'path' => '/bar',
        ]),
        new Cookie([
            'name' => 'baz',
            'value' => 'three',
            'domain' => '.www.crwl.io',
            'expires' => '1764076860.878',
        ]),
    ]);

    $cookieJar = new CookieJar();

    $cookieJar->addFrom(Url::parse('https://www.crwl.io'), $browserCookies);

    $cookies = $cookieJar->allByDomain('crwl.io');

    expect($cookies)->toHaveCount(3);

    // Cookie with all attributes set.
    $foo = $cookies['foo'];

    expect($foo->expires()?->dateTime()->format('Y-m-d H:i'))->toBe('2025-04-19 13:21');
    expect($foo->name())->toBe('foo');
    expect($foo->value())->toBe('one');
    expect($foo->domain())->toBe('www.crwl.io');
    expect($foo->maxAge())->toBe(86400);
    expect($foo->path())->toBeNull();
    expect($foo->secure())->toBeTrue();
    expect($foo->httpOnly())->toBeTrue();
    expect($foo->sameSite())->toBe('Strict');

    // Cookie with a fractional expires timestamp and a path.
    $bar = $cookies['bar'];

    expect($bar->expires()?->dateTime()->format('Y-m-d H:i'))->toBe('2024-10-22 13:21');
    expect($bar->name())->toBe('bar');
    expect($bar->value())->toBe('two');
    expect($bar->domain())->toBe('www.crwl.io');
    expect($bar->maxAge())->toBeNull();
    expect($bar->path())->toBe('/bar');
    expect($bar->secure())->toBeFalse();
    expect($bar->httpOnly())->toBeFalse();
    expect($bar->sameSite())->toBe('Lax');

    $baz = $cookies['baz'];

    expect($baz->expires()?->dateTime()->format('Y-m-d H:i'))->toBe('2025-11-25 13:21');
});

it('adds all cookies from a response', function () {
    $response = new Response(200, [
        'Set-Cookie' => ['cook13=v4lu3; Secure', 'anotherCookie=andItsValue', 'oneMoreCookie=dough'],
    ]);

    $cookieJar = new CookieJar();

    $cookieJar->addFrom(Url::parse('https://www.otsch.codes'), $response);

    expect($cookieJar->allByDomain('otsch.codes'))->toHaveCount(3);
});

// The jar holds a plain cookie, a Secure cookie and a __Host- prefixed cookie. The expected
// counts differ for the same host, another subdomain via https and another subdomain via http.
it('returns all cookies that should be sent to a url', function () {
    $response = new Response(200, [
        'Set-Cookie' => [
            'cook13=v4lu3; Secure',
            '__Host-anotherCookie=andItsValue; Secure; Path=/',
            'oneMoreCookie=dough',
        ],
    ]);

    $cookieJar = new CookieJar();

    $cookieJar->addFrom(Url::parse('https://www.otsch.codes/blog'), $response);

    expect($cookieJar->getFor('https://www.otsch.codes/contact'))->toHaveCount(3);

    expect($cookieJar->getFor('https://jobs.otsch.codes/index'))->toHaveCount(2);

    expect($cookieJar->getFor('http://games.otsch.codes'))->toHaveCount(1);
});


================================================
FILE: tests/Loader/Http/Cookies/CookieTest.php
================================================
<?php

namespace tests\Loader\Http\Cookies;

use Crwlr\Crawler\Loader\Http\Cookies\Exceptions\InvalidCookieException;
use Crwlr\Crawler\Loader\Http\Cookies\Cookie;
use Crwlr\Crawler\Loader\Http\Cookies\Date;
use Crwlr\Url\Url;
use DateInterval;
use DateTime;
use DateTimeInterface;
use DateTimeZone;
use Psr\Http\Message\UriInterface;

// The url a cookie was received from can be passed as a plain string.
test('It can be created with received from url as string argument', function () {
    $cookie = new Cookie('https://www.crwlr.software/packages', 'cookieName=cookieValue');
    expect($cookie)->toBeInstanceOf(Cookie::class);
});

test('It can be created with received from url as Url object', function () {
    $receivedFrom = Url::parse('https://www.crwlr.software/packages');

    $cookie = new Cookie($receivedFrom, 'cookieName=cookieValue');

    expect($cookie)->toBeInstanceOf(Cookie::class);
});

test('It provides the received from url as PSR-7 Uri object', function () {
    $cookie = new Cookie('https://www.crwlr.software/contact', 'cookieName=cookieValue');

    $receivedFrom = $cookie->receivedFromUrl();

    expect($receivedFrom)->toBeInstanceOf(UriInterface::class);
});

// A Set-Cookie value without "=" has no value part and must be rejected.
test(
    'It must at least have a name and value',
    fn() => new Cookie(Url::parse('https://www.crwlr.software/packages'), 'cookieNameWithoutValueIsInvalid'),
)->throws(InvalidCookieException::class);

test('It parses the name and value of the cookie', function () {
    $cookie = new Cookie('https://www.crwlr.software/blog', 'crwlrsoftware_session=foobar');

    expect($cookie->name())->toBe('crwlrsoftware_session')
        ->and($cookie->value())->toBe('foobar');
});

// Attributes like Secure or Path are not part of the string representation.
test('The __toString() method returns name=value (only)', function () {
    $cookie = new Cookie('https://www.crwl.io', '__Secure-cook13N4m3=c00k1eV4lu3; Secure; Path=/');

    $asString = $cookie->__toString();

    expect($asString)->toBe('__Secure-cook13N4m3=c00k1eV4lu3');
});

test('It automatically sets the domain based on the received from url when no attribute is included', function () {
    // See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Set-Cookie:
    // "If omitted, this attribute defaults to the host of the current document URL, not including subdomains."
    $cookie = new Cookie('https://www.otsch.codes/blog', 'otschcodes_session=cook13');

    $domain = $cookie->domain();

    expect($domain)->toBe('otsch.codes');
});

// The Expires attribute is parsed into a Date object wrapping the date/time from the header.
test('It parses an expires attribute when included', function () {
    $cookie = new Cookie(
        'https://www.otsch.codes/blog',
        'otschcodes_session=cook13; Expires=Wed, 23-Feb-2022 10:13:41 GMT',
    );
    expect($cookie->expires())->toBeInstanceOf(Date::class);
    // expires() is nullable, so use the nullsafe operator (as the other tests do) instead of
    // suppressing the static analysis error. If it is null, the assertion below fails.
    expect($cookie->expires()?->dateTime()->format('Y-m-d H:i'))->toBe('2022-02-23 10:13');
});

test('It parses a maxAge attribute when included', function () {
    $cookie = new Cookie('https://www.otsch.codes/blog', 'otschcodes_session=cook13; Max-Age=600');

    expect($cookie->maxAge())->toBeInt()
        ->and($cookie->maxAge())->toBe(600);
});

test('It parses a domain attribute when included', function () {
    $cookie = new Cookie('https://sub.domain.example.com/foobar', 'fookie=cook13; domain=domain.example.com');

    $domain = $cookie->domain();

    expect($domain)->toBe('domain.example.com');
});

// The domain attribute must be the host the cookie was received from, or a parent domain of it.
test(
    'It\'s not allowed to set a different domain than the one of the document url it was received from',
    fn() => new Cookie('https://sub.domain.example.com/foobar', 'fookie=cook13; domain=crwl.io'),
)->throws(InvalidCookieException::class);

test(
    'It\'s not allowed to set a subdomain that is not included in the document url it was received from',
    fn() => new Cookie('https://sub.domain.example.com/foobar', 'fookie=cook13; domain=foo.example.com'),
)->throws(InvalidCookieException::class);

test('When domain attribute is defined with leading dot, it\'s ignored', function () {
    $cookie = new Cookie('https://sub.domain.example.com/', 'fookie=cook13; domain=.domain.example.com');

    $domain = $cookie->domain();

    expect($domain)->toBe('domain.example.com');
});

test('It parses a path attribute when included', function () {
    $cookie = new Cookie('https://sub.domain.example.com/foobar', 'co=asdf2345; path=/foobar');

    $path = $cookie->path();

    expect($path)->toBe('/foobar');
});

test('It parses a secure attribute when included', function () {
    $cookie = new Cookie('https://sub.domain.example.com/foobar', 'co=asdf2345; Secure');

    $isSecure = $cookie->secure();

    expect($isSecure)->toBeTrue();
});

// A Secure cookie received via plain http is invalid.
test(
    'It throws an exception when secure attribute is sent but url where it was received from is not on https',
    fn() => new Cookie('http://www.example.io/foobar', 'eggs=ample; Secure'),
)->throws(InvalidCookieException::class);

test('It parses a SameSite attribute when included', function ($sameSite) {
    $cookie = new Cookie('https://www.example.io/foobar', 'eggs=ample; SameSite=' . $sameSite);

    $parsed = $cookie->sameSite();

    expect($parsed)->toBe($sameSite);
})->with(['Strict', 'Lax', 'None']);

test(
    'It throws an error when an unknown value is sent for the SameSite attribute',
    fn() => new Cookie('https://www.example.io/foobar', 'eggs=ample; SameSite=Foo'),
)->throws(InvalidCookieException::class);

test('It parses an HttpOnly attribute when included', function () {
    $cookie = new Cookie('https://jobs.foo.bar/', 'csrf=asdfjkloe123; HttpOnly');

    $isHttpOnly = $cookie->httpOnly();

    expect($isHttpOnly)->toBeTrue();
});

// Several attributes in one Set-Cookie value are all parsed.
test('It\'s possible to set multiple attributes', function () {
    $setCookieValue = '__Secure-cook13N4m3=c00k1eV4lu3; Expires=Wed, 23-Feb-2022 10:13:41 GMT; Secure; Path=/foo';

    $cookie = new Cookie('https://www.crwl.io', $setCookieValue);

    expect($cookie->secure())->toBeTrue()
        ->and($cookie->expires()?->dateTime()->format('d.m.Y H:i'))->toBe('23.02.2022 10:13')
        ->and($cookie->path())->toBe('/foo');
});

// Cookie name prefixes (__Secure- and __Host-) come with additional requirements.
test(
    'It throws an Exception when cookie name is prefixed with __Secure- or __Host- and not sent via https',
    fn($prefix) => new Cookie('http://example.com', $prefix . 'Abc=defg123; Secure'),
)->with(['__Secure-', '__Host-'])->throws(InvalidCookieException::class);

test(
    'It throws an Exception when cookie name is prefixed with __Secure- or __Host- and Secure flag is not included',
    fn($prefix) => new Cookie('https://example.com', $prefix . 'Abc=defg123;'),
)->with(['__Secure-', '__Host-'])->throws(InvalidCookieException::class);

test('Using __Secure- prefix works when received via https and Secure flag is included', function () {
    $cookie = new Cookie('https://www.crwl.io', '__Secure-Foo=bar123; Secure');

    $hasPrefix = $cookie->hasSecurePrefix();

    expect($hasPrefix)->toBeTrue();
});

// __Host- cookies: no Domain attribute allowed and the Path attribute must be exactly "/".
test(
    'It throws an Exception when __Host- prefix used and Domain attribute included',
    fn() => new Cookie(
        'https://www.crwlr.software/',
        '__Host-Foo=bar123; Secure; Domain=www.crwlr.software; Path=/',
    ),
)->throws(InvalidCookieException::class);

test(
    'It throws an Exception when __Host- prefix used and Path attribute is not included',
    fn() => new Cookie('https://www.crwlr.software/', '__Host-Foo=bar123; Secure;'),
)->throws(InvalidCookieException::class);

test(
    'It throws an Exception when __Host- prefix used and Path attribute is not "/"',
    fn() => new Cookie('https://www.crwlr.software/', '__Host-Foo=bar123; Secure; Path=/foo'),
)->throws(InvalidCookieException::class);

test('Using __Host- works when everything is valid', function () {
    $cookie = new Cookie('https://www.crwlr.software/', '__Host-Foo=bar123; Secure; Path=/');

    $hasPrefix = $cookie->hasHostPrefix();

    expect($hasPrefix)->toBeTrue();
});

// Dataset rows: [url the cookie was received from, Domain attribute (or null), url to check].
test(
    'It should not be sent to a url when the domain doesn\'t match',
    function ($receivedFrom, $domainAttribute, $shouldBeSentTo) {
        $setCookieValue = 'cookie=value';

        if ($domainAttribute) {
            $setCookieValue .= '; Domain=' . $domainAttribute;
        }

        $cookie = new Cookie($receivedFrom, $setCookieValue);

        expect($cookie->shouldBeSentTo($shouldBeSentTo))->toBeFalse();
    },
)->with([
    ['https://www.crwlr.software', null, 'https://www.otsch.codes'],
    ['https://www.crwlr.software', 'www.crwlr.software', 'https://jobs.crwlr.software'],
    ['https://www.crwlr.software', 'www.crwlr.software', 'https://crwlr.software'],
    ['https://sub.domain.crwlr.software', 'sub.domain.crwlr.software', 'https://sab.domain.crwlr.software'],
    ['https://sub.domain.crwlr.software', 'sub.domain.crwlr.software', 'https://domain.crwlr.software'],
]);

test('It should be sent to a url when the domain matches', function ($receivedFrom, $domainAttribute, $shouldBeSentTo) {
    $setCookieValue = 'cookie=value';

    if ($domainAttribute) {
        $setCookieValue .= '; Domain=' . $domainAttribute;
    }

    $cookie = new Cookie($receivedFrom, $setCookieValue);

    expect($cookie->shouldBeSentTo($shouldBeSentTo))->toBeTrue();
})->with([
    ['https://www.crwlr.software', null, 'https://www.crwlr.software'],
    ['https://www.crwlr.software', null, 'https://crwlr.software'],
    ['https://www.crwlr.software', null, 'https://anything.crwlr.software'],
    ['https://sub.domain.crwlr.software', 'domain.crwlr.software', 'https://domain.crwlr.software'],
    ['https://sub.domain.crwlr.software', 'domain.crwlr.software', 'https://sab.domain.crwlr.software'],
]);

// Dataset rows: [url the cookie was received from, url to check].
test(
    'It should not be sent to a url when it has a __Host- prefix and hosts don\'t match exactly',
    function ($receivedFrom, $shouldBeSentTo) {
        $cookie = new Cookie($receivedFrom, '__Host-cookie=value; Secure; Path=/');

        $isSent = $cookie->shouldBeSentTo($shouldBeSentTo);

        expect($isSent)->toBeFalse();
    },
)->with([
    ['https://www.crwlr.software', 'https://jobs.crwlr.software'],
    ['https://sub.domain.crwlr.software', 'https://domain.crwlr.software'],
    ['https://subdomain.crwlr.software', 'https://sabdomain.crwlr.software'],
]);

test('It should not be sent to non https url when secure flag is included', function () {
    $cookie = new Cookie('https://www.crwl.io', 'cookie=value; Secure');

    $isSent = $cookie->shouldBeSentTo('http://www.crwl.io');

    expect($isSent)->toBeFalse();
});

test('It should be sent to https url when secure flag is included', function () {
    $cookie = new Cookie('https://www.crwl.io', 'cookie=value; Secure');

    $isSent = $cookie->shouldBeSentTo('https://www.crwl.io');

    expect($isSent)->toBeTrue();
});

// Local hosts are the exception to the "Secure cookies only via https" rule.
test('It should be sent to non https url when secure flag is included but host is localhost', function ($host) {
    $cookie = new Cookie('https://' . $host, 'cookie=value; Secure');

    $isSent = $cookie->shouldBeSentTo('http://' . $host);

    expect($isSent)->toBeTrue();
})->with(['localhost', '127.0.0.1']);

// Dataset rows: [Path attribute of the cookie, path of the url to check].
test(
    'It should not be sent to urls where the path doesn\'t match the sent path attribute',
    function ($path, $shouldBeSentTo) {
        $baseUrl = 'https://www.crwlr.software';

        $cookie = new Cookie($baseUrl, 'cookie=value; Path=' . $path);

        expect($cookie->shouldBeSentTo($baseUrl . $shouldBeSentTo))->toBeFalse();
    },
)->with([
    ['/foo', '/bar'],
    ['/foo', '/foobar'],
    ['/foo', '/'],
    ['/foo', '/bar/foo'],
]);

test(
    'It should be sent to urls where the path does match the sent path attribute',
    function ($path, $shouldBeSentTo) {
        $baseUrl = 'https://www.crwlr.software';

        $cookie = new Cookie($baseUrl, 'cookie=value; Path=' . $path);

        expect($cookie->shouldBeSentTo($baseUrl . $shouldBeSentTo))->toBeTrue();
    },
)->with([
    ['/', '/anything'],
    ['/foo', '/foo'],
    ['/foo', '/foo/something'],
    ['/foo', '/foo/some/thing'],
]);

// Expires one second in the past: the cookie must not be sent anymore.
test('It should not be sent when already expired', function () {
    $expires = (new DateTime('now', new DateTimeZone('GMT')))->sub(new DateInterval('PT1S'));

    $setCookieValue = 'cookie=value; Expires=' . $expires->format(DateTimeInterface::COOKIE);

    $cookie = new Cookie('https://www.crwlr.software', $setCookieValue);

    expect($cookie->shouldBeSentTo('https://www.crwlr.software'))->toBeFalse();
});

// Expires five seconds in the future: the cookie is still valid.
test('It should be sent when date of expires attribute is not reached', function () {
    $expires = (new DateTime('now', new DateTimeZone('GMT')))->add(new DateInterval('PT5S'));

    $setCookieValue = 'cookie=value; Expires=' . $expires->format(DateTimeInterface::COOKIE);

    $cookie = new Cookie('https://www.crwlr.software', $setCookieValue);

    expect($cookie->shouldBeSentTo('https://www.crwlr.software'))->toBeTrue();
});

test('It should not be sent when maxAge attribute is already reached', function () {
    $url = 'https://www.crwlr.software';

    $cookie = new Cookie($url, 'cookie=value; Max-Age=1');

    expect($cookie->shouldBeSentTo($url))->toBeTrue();

    // Move the received-at timestamp two seconds into the past, so the test does not need to sleep.
    invade($cookie)->receivedAtTimestamp -= 2;

    expect($cookie->shouldBeSentTo($url))->toBeFalse();
});

test('It is immediately expired when the max-age attribute is zero or negative', function ($maxAgeValue) {
    $url = 'https://www.crwlr.software';

    $cookie = new Cookie($url, 'cookie=value; Max-Age=' . $maxAgeValue);

    expect($cookie->shouldBeSentTo($url))->toBeFalse();
})->with([0, -1, -5, -1000]);

test('It should be sent when maxAge attribute is not yet reached', function () {
    $url = 'https://www.crwlr.software';

    $cookie = new Cookie($url, 'cookie=value; Max-Age=1');

    expect($cookie->shouldBeSentTo($url))->toBeTrue();
});


================================================
FILE: tests/Loader/Http/Cookies/DateTest.php
================================================
<?php

namespace tests\Loader\Http\Cookies;

use Crwlr\Crawler\Loader\Http\Cookies\Date;
use DateTimeZone;

test('It can be created from a valid http header date format', function () {
    $date = new Date('Tue, 22-Feb-2022 16:04:55 GMT');

    expect($date)->toBeInstanceOf(Date::class)
        ->and($date->dateTime()->format('Y-m-d H:i:s'))->toBe('2022-02-22 16:04:55');
});

// The parsed GMT time, converted to Europe/Vienna, is one hour later in winter.
test('It gets the timezone right', function () {
    $date = new Date('Tue, 22-Feb-2022 20:04:29 GMT');

    $inVienna = $date->dateTime()->setTimezone(new DateTimeZone('Europe/Vienna'));

    expect($inVienna->format('d.m.Y H:i:s'))->toBe('22.02.2022 21:04:29');
});

// Same check with a date string using spaces instead of dashes (two hours later in summer).
test('It also works without the dashes between d-M-Y in the format', function () {
    $date = new Date('Wed, 05 Jul 2023 15:19:55 GMT');

    $inVienna = $date->dateTime()->setTimezone(new DateTimeZone('Europe/Vienna'));

    expect($inVienna->format('d.m.Y H:i:s'))->toBe('05.07.2023 17:19:55');
});


================================================
FILE: tests/Loader/Http/HeadlessBrowserLoaderHelperTest.php
================================================
<?php

namespace tests\Loader\Http;

use Closure;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\HeadlessBrowserLoaderHelper;
use Crwlr\Crawler\Steps\Loading\Http;
use Exception;
use GuzzleHttp\Psr7\Request;
use HeadlessChromium\AutoDiscover;
use HeadlessChromium\Browser\ProcessAwareBrowser;
use HeadlessChromium\BrowserFactory;
use HeadlessChromium\Communication\Message;
use HeadlessChromium\Communication\Session;
use HeadlessChromium\Cookies\CookiesCollection;
use HeadlessChromium\Page;
use HeadlessChromium\PageUtils\PageNavigation;
use Mockery;
use Psr\Log\LoggerInterface;
use tests\_Stubs\DummyLogger;

use function tests\helper_getMinThrottler;

/**
 * Builds a BrowserFactory mock wired up with mocks for everything a page navigation touches.
 *
 * The returned factory's createBrowser() returns a ProcessAwareBrowser mock, whose createPage()
 * returns a Page mock. The Page mock answers getSession(), getCookies() (empty collection),
 * navigate() (returns a PageNavigation mock) and getHtml() (a fixed "Hello World!" document).
 * The optional callbacks allow a test to add expectations to the individual mocks.
 *
 * @param Closure|null $pageNavigationArgsClosure  passed to withArgs() on the PageNavigation mock's
 *                                                 waitForNavigation() expectation
 * @param Closure|null $createBrowserArgsExpectationCallback  passed to withArgs() on the factory's
 *                                                            createBrowser() expectation
 * @param Closure|null $browserMockCallback  called with the ProcessAwareBrowser mock
 * @param Closure|null $pageSessionMockCallback  called with the Session mock
 * @param Closure|null $pageMockCallback  called with the Page mock
 * @return BrowserFactory  the BrowserFactory mock
 */
function helper_setUpHeadlessChromeMocks(
    ?Closure $pageNavigationArgsClosure = null,
    ?Closure $createBrowserArgsExpectationCallback = null,
    ?Closure $browserMockCallback = null,
    ?Closure $pageSessionMockCallback = null,
    ?Closure $pageMockCallback = null,
): BrowserFactory {
    $browserFactoryMock = Mockery::mock(BrowserFactory::class);

    $browserMock = Mockery::mock(ProcessAwareBrowser::class);

    $createBrowserExpectation = $browserFactoryMock->shouldReceive('createBrowser');

    // Only constrain the createBrowser() arguments when the test asked for it.
    if ($createBrowserArgsExpectationCallback) {
        $createBrowserExpectation->withArgs($createBrowserArgsExpectationCallback);
    }

    $createBrowserExpectation->andReturn($browserMock);

    $pageMock = Mockery::mock(Page::class);

    $browserMock->shouldReceive('createPage')->andReturn($pageMock);

    // Let the test add its own expectations to the browser mock.
    if ($browserMockCallback) {
        $browserMockCallback($browserMock);
    }

    $sessionMock = Mockery::mock(Session::class);

    $pageMock->shouldReceive('getSession')->andReturn($sessionMock);

    // The test's session expectations are registered before the default 'once' expectation below.
    if ($pageSessionMockCallback) {
        $pageSessionMockCallback($sessionMock);
    }

    $pageMock->shouldReceive('getCookies')->andReturn(new CookiesCollection([]));

    $sessionMock->shouldReceive('once');

    $pageNavigationMock = Mockery::mock(PageNavigation::class);

    $pageMock->shouldReceive('navigate')->andReturn($pageNavigationMock);

    $pageMock->shouldReceive('getHtml')->andReturn('<html><head></head><body>Hello World!</body></html>');

    // The test's page expectations are registered after the defaults above.
    if ($pageMockCallback) {
        $pageMockCallback($pageMock);
    }

    $waitForNavigationCall = $pageNavigationMock->shouldReceive('waitForNavigation');

    // Only constrain the waitForNavigation() arguments when the test asked for it.
    if ($pageNavigationArgsClosure) {
        $waitForNavigationCall->withArgs($pageNavigationArgsClosure);
    }

    return $browserFactoryMock;
}

// The mocked waitForNavigation() call only matches when event and timeout are the expected ones.
it('uses the configured timeout', function () {
    $factory = helper_setUpHeadlessChromeMocks(
        pageNavigationArgsClosure: function (string $event, int $timeout) {
            return $event === Page::LOAD && $timeout === 45_000;
        },
    );

    $loaderHelper = new HeadlessBrowserLoaderHelper($factory);

    $loaderHelper->setTimeout(45_000);

    $request = new Request('GET', 'https://www.example.com/foo');

    $respondedRequest = $loaderHelper->navigateToPageAndGetRespondedRequest(
        $request,
        helper_getMinThrottler(),
        cookieJar: new CookieJar(),
    );

    $body = Http::getBodyString($respondedRequest);

    expect($body)->toBe('<html><head></head><body>Hello World!</body></html>');
});

it('returns the configured timeout', function () {
    $loaderHelper = new HeadlessBrowserLoaderHelper();

    // Default value before anything was configured.
    $defaultTimeout = $loaderHelper->getTimeout();

    expect($defaultTimeout)->toBe(30_000);

    $loaderHelper->setTimeout(75_000);

    $configuredTimeout = $loaderHelper->getTimeout();

    expect($configuredTimeout)->toBe(75_000);
});

it('waits for the configured browser navigation event', function () {
    $factory = helper_setUpHeadlessChromeMocks(
        pageNavigationArgsClosure: function (string $event, int $timeout) {
            return $event === Page::FIRST_MEANINGFUL_PAINT && $timeout === 57_000;
        },
    );

    $loaderHelper = new HeadlessBrowserLoaderHelper($factory);

    $loaderHelper
        ->waitForNavigationEvent(Page::FIRST_MEANINGFUL_PAINT)
        ->setTimeout(57_000);

    $request = new Request('GET', 'https://www.example.com/foo');

    $respondedRequest = $loaderHelper->navigateToPageAndGetRespondedRequest(
        $request,
        helper_getMinThrottler(),
        cookieJar: new CookieJar(),
    );

    $body = Http::getBodyString($respondedRequest);

    expect($body)->toBe('<html><head></head><body>Hello World!</body></html>');
});

it('uses the correct executable', function () {
    // Part 1: an executable that can't be a chrome binary must make getBrowser() fail.
    $helperWithInvalidExecutable = new HeadlessBrowserLoaderHelper();

    $helperWithInvalidExecutable->setExecutable('somethingthatdefinitelyisntachromeexecutable');

    $invaded = invade($helperWithInvalidExecutable);

    $caught = null;

    try {
        $invaded->getBrowser(new Request('GET', 'https://www.example.com/foo'));
    } catch (Exception $caught) {
    }

    expect($caught)->not->toBeNull();

    // Part 2: the auto-discovered chrome binary is passed on to the BrowserFactory.
    $discoveredExecutable = (new AutoDiscover())->guessChromeBinaryPath();

    $helperWithValidExecutable = new HeadlessBrowserLoaderHelper();

    $helperWithValidExecutable->setExecutable($discoveredExecutable);

    $invaded = invade($helperWithValidExecutable);

    $invaded->getBrowser(new Request('GET', 'https://www.example.com/foo'));

    $factory = $invaded->browserFactory;

    expect($factory)->toBeInstanceOf(BrowserFactory::class);

    /** @var BrowserFactory $factory */
    $invadedFactory = invade($factory);

    expect($invadedFactory->chromeBinary)->toBe($discoveredExecutable);
});

// Temporary hooks are only applied to the next navigation; a second navigation must not call them again.
it('calls the temporary post navigate hooks once', function () {
    $factory = helper_setUpHeadlessChromeMocks(
        pageMockCallback: function (Mockery\MockInterface $page) {
            $page->shouldReceive('assertNotClosed')->once();
        },
    );

    $dummyLogger = new DummyLogger();

    $loaderHelper = new HeadlessBrowserLoaderHelper($factory, $dummyLogger);

    // Creates a hook that only logs the given message via the logger it is called with.
    $makeHook = function (string $message) {
        return function (Page $page, LoggerInterface $logger) use ($message) {
            $logger->info($message);
        };
    };

    $loaderHelper->setTempPostNavigateHooks([
        $makeHook('hook 1 called'),
        $makeHook('hook 2 called'),
        $makeHook('hook 3 called'),
    ]);

    $loaderHelper->navigateToPageAndGetRespondedRequest(
        new Request('GET', 'https://www.example.com/foo'),
        helper_getMinThrottler(),
        cookieJar: new CookieJar(),
    );

    expect($dummyLogger->messages)->toHaveCount(3);

    expect($dummyLogger->messages[0]['message'])->toBe('hook 1 called');

    expect($dummyLogger->messages[1]['message'])->toBe('hook 2 called');

    expect($dummyLogger->messages[2]['message'])->toBe('hook 3 called');

    $loaderHelper->navigateToPageAndGetRespondedRequest(
        new Request('GET', 'https://www.example.com/foo'),
        helper_getMinThrottler(),
        cookieJar: new CookieJar(),
    );

    // Still only the three messages from the first navigation.
    expect($dummyLogger->messages)->toHaveCount(3);
});

// A script registered through setPageInitScript() has to be handed, unchanged and exactly once,
// to the browser mock's setPagePreScript() when a page is navigated.
it(
    'passes the script source provided via the setPageInitScript() method, to the ' .
    'ProcessAwareBrowser::setPagePreScript() method',
    function () {
        $initScript = 'console.log(\'hey\');';

        $expectPreScript = function (Mockery\MockInterface $browser) use ($initScript) {
            $browser->shouldReceive('setPagePreScript')->once()->with($initScript);
        };

        $helper = new HeadlessBrowserLoaderHelper(
            helper_setUpHeadlessChromeMocks(browserMockCallback: $expectPreScript),
        );

        $helper->setPageInitScript($initScript);

        $request = new Request('GET', 'https://www.example.com/bar');

        $helper->navigateToPageAndGetRespondedRequest(
            $request,
            helper_getMinThrottler(),
            cookieJar: new CookieJar(),
        );
    },
);

// Counterpart to the init-script test: without setPageInitScript() the browser mock must never
// receive setPagePreScript().
it('does not call the ProcessAwareBrowser::setPagePreScript() when no page init script was defined', function () {
    $forbidPreScript = function (Mockery\MockInterface $browser) {
        $browser->shouldNotReceive('setPagePreScript');
    };

    $helper = new HeadlessBrowserLoaderHelper(
        helper_setUpHeadlessChromeMocks(browserMockCallback: $forbidPreScript),
    );

    $request = new Request('GET', 'https://www.example.com/bar');

    $helper->navigateToPageAndGetRespondedRequest(
        $request,
        helper_getMinThrottler(),
        cookieJar: new CookieJar(),
    );
});

// The request's user-agent header ('MyBot') must show up as the `userAgent` browser option as long
// as useNativeUserAgent() was not called.
it(
    'passes the userAgent option when Request contains a user-agent header and useNativeUserAgent() was not called',
    function () {
        $optionsCarryMyBot = fn ($options) => array_key_exists('userAgent', $options)
            && $options['userAgent'] === 'MyBot';

        $helper = new HeadlessBrowserLoaderHelper(
            helper_setUpHeadlessChromeMocks(createBrowserArgsExpectationCallback: $optionsCarryMyBot),
        );

        $request = new Request('GET', 'https://www.example.com/bar', ['user-agent' => ['MyBot']]);

        $respondedRequest = $helper->navigateToPageAndGetRespondedRequest(
            $request,
            helper_getMinThrottler(),
            cookieJar: new CookieJar(),
        );

        expect(Http::getBodyString($respondedRequest))
            ->toBe('<html><head></head><body>Hello World!</body></html>');
    },
);

// After useNativeUserAgent() the browser options must not contain a `userAgent` key at all, even
// though the request carries a user-agent header.
it(
    'does not pass the userAgent option when Request contains a user-agent header and useNativeUserAgent() was called',
    function () {
        $optionsLackUserAgent = fn ($options) => !array_key_exists('userAgent', $options);

        $helper = new HeadlessBrowserLoaderHelper(
            helper_setUpHeadlessChromeMocks(createBrowserArgsExpectationCallback: $optionsLackUserAgent),
        );

        $helper->useNativeUserAgent();

        $request = new Request('GET', 'https://www.example.com/bar', ['user-agent' => ['MyBot']]);

        $respondedRequest = $helper->navigateToPageAndGetRespondedRequest(
            $request,
            helper_getMinThrottler(),
            cookieJar: new CookieJar(),
        );

        expect(Http::getBodyString($respondedRequest))
            ->toBe('<html><head></head><body>Hello World!</body></html>');
    },
);

// Navigating without a cookie jar argument must send exactly one `Network.clearBrowserCookies`
// message through the page session.
it('clears the browsers cookies when no cookie jar is provided', function () {
    $isClearCookiesMessage = fn (Message $message) => $message->getMethod() === 'Network.clearBrowserCookies';

    $expectCookiesCleared = function (Mockery\MockInterface $mock) use ($isClearCookiesMessage) {
        $mock->shouldReceive('sendMessageSync')->once()->withArgs($isClearCookiesMessage);
    };

    $helper = new HeadlessBrowserLoaderHelper(
        helper_setUpHeadlessChromeMocks(pageSessionMockCallback: $expectCookiesCleared),
    );

    $request = new Request('GET', 'https://www.example.com/yolo', ['user-agent' => ['MyBot']]);

    $respondedRequest = $helper->navigateToPageAndGetRespondedRequest($request, helper_getMinThrottler());

    expect(Http::getBodyString($respondedRequest))
        ->toBe('<html><head></head><body>Hello World!</body></html>');
});

// Three navigations, but assertNotClosed() is expected only twice on the page mock.
// NOTE(review): presumably the first navigation creates the page and only the following two re-check
// the existing one — confirm against HeadlessBrowserLoaderHelper.
it('reuses a previously opened page', function () {
    $browserFactoryMock = helper_setUpHeadlessChromeMocks(
        pageMockCallback: function (Mockery\MockInterface $pageMock) {
            $pageMock->shouldReceive('assertNotClosed')->twice();
        },
    );

    $helper = new HeadlessBrowserLoaderHelper($browserFactoryMock);

    $throttler = helper_getMinThrottler();

    $cookieJar = new CookieJar();

    foreach (['foo', 'bar', 'baz'] as $path) {
        $helper->navigateToPageAndGetRespondedRequest(
            new Request('GET', 'https://www.example.com/' . $path),
            $throttler,
            null,
            $cookieJar,
        );
    }
});


================================================
FILE: tests/Loader/Http/HttpLoaderPolitenessTest.php
================================================
<?php

namespace tests\Loader\Http;

use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\Http\HeadlessBrowserLoaderHelper;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgent;
use GuzzleHttp\Psr7\Response;
use HeadlessChromium\Browser;
use HeadlessChromium\Communication\Session;
use HeadlessChromium\Cookies\CookiesCollection;
use HeadlessChromium\Page;
use HeadlessChromium\PageUtils\PageNavigation;
use Mockery;
use PHPUnit\Framework\TestCase;
use Psr\Http\Client\ClientInterface;
use Psr\Http\Message\RequestInterface;

use function tests\helper_getDummyRobotsTxtResponse;

/**
 * Blocks the current process for (at least) 300 milliseconds.
 *
 * Busy-waits on the wall clock instead of sleeping.
 * NOTE(review): presumably deliberate so the delay ends right at 0.3 s – the throttling tests in this
 * file assert fairly tight upper bounds – confirm before swapping in a sleep call.
 */
function helper_wait300ms(): void
{
    $startedAt = microtime(true);

    do {
        $elapsed = microtime(true) - $startedAt;
    } while ($elapsed < 0.3);
}

/** @var TestCase $this */

// Throttling check for two consecutive requests to the same host (www.example.com), run for both
// load() and loadOrFail(). The first mocked response takes about 300 ms; the time measured around the
// second load must be more than 0.3 s and less than 0.62 s.
// NOTE(review): the bounds presumably mirror the default Throttler waiting between 1x and 2x the
// previous response time — confirm against the Throttler defaults.
it('throttles requests to the same domain', function ($loadingMethod) {
    $httpClient = Mockery::mock(ClientInterface::class);

    // First request: artificially slow response (helper_wait300ms() busy-waits for 0.3 s).
    $httpClient->shouldReceive('sendRequest')->once()->andReturnUsing(function (RequestInterface $request) {
        $response = new Response(200, [], $request->getUri()->__toString() . ' response');

        helper_wait300ms();

        return $response;
    });

    // Second request: answered immediately, so any delay measured below comes from the loader itself.
    $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response(200));

    $loader = new HttpLoader(new UserAgent('SomeUserAgent'), $httpClient);

    $loader->{$loadingMethod}('https://www.example.com/foo');

    // Timestamp taken right after the first load returned.
    $firstResponse = microtime(true);

    $loader->{$loadingMethod}('https://www.example.com/bar');

    $secondResponse = microtime(true);

    $diff = $secondResponse - $firstResponse;

    expect($diff)->toBeGreaterThan(0.3)
        ->and($diff)->toBeLessThan(0.62);
})->with(['load', 'loadOrFail']);

// Same throttling expectation as for the plain HTTP client, but with the loader switched to the
// headless browser. Browser, page, session and navigation are all Mockery mocks; the first
// navigate() call is delayed by 300 ms and the second load must then take between 0.3 s and 0.62 s.
it('also throttles requests using the headless browser', function ($loadingMethod) {
    $browserMock = Mockery::mock(Browser::class);

    $pageMock = Mockery::mock(Page::class);

    $sessionMock = Mockery::mock(Session::class);

    $sessionMock->shouldReceive('once');

    $pageMock->shouldReceive('assertNotClosed')->once();

    $pageMock->shouldReceive('getSession')->andReturn($sessionMock);

    $pageNavigationMock = Mockery::mock(PageNavigation::class);

    $pageNavigationMock->shouldReceive('waitForNavigation');

    // First navigation: simulate a slow page load via helper_wait300ms().
    $pageMock
        ->shouldReceive('navigate')
        ->once()
        ->andReturnUsing(function (string $url) use ($pageNavigationMock) {
            helper_wait300ms();

            return $pageNavigationMock;
        });

    $pageMock->shouldReceive('getCookies')->andReturn(new CookiesCollection());

    $pageMock->shouldReceive('getHtml')->andReturn('<html>foo</html>');

    $browserMock->shouldReceive('createPage')->andReturn($pageMock);

    // Partial mock: only the protected getBrowser() is replaced so it hands out the browser mock.
    $browserHelperMock = Mockery::mock(HeadlessBrowserLoaderHelper::class)->makePartial();

    $browserHelperMock
        ->shouldAllowMockingProtectedMethods()
        ->shouldReceive('getBrowser')
        ->andReturn($browserMock);

    $loader = new HttpLoader(new UserAgent('SomeUserAgent'));

    // Inject the helper mock into the loader's non-public browserHelper property.
    invade($loader)->browserHelper = $browserHelperMock;

    $loader->useHeadlessBrowser();

    $loader->{$loadingMethod}('https://www.example.com/foo');

    // Expectations for the second load are registered only now, after the delayed navigate() above
    // has been used; this navigate() returns immediately.
    $pageMock->shouldReceive('navigate')->andReturn($pageNavigationMock);

    $pageMock->shouldReceive('getCookies')->andReturn(new CookiesCollection());

    $firstResponse = microtime(true);

    $loader->{$loadingMethod}('https://www.example.com/bar');

    $secondResponse = microtime(true);

    $diff = $secondResponse - $firstResponse;

    expect($diff)->toBeGreaterThan(0.3)
        ->and($diff)->toBeLessThan(0.62);
})->with(['load', 'loadOrFail']);

// Counterpart to the same-domain test: the second request goes to a different host
// (www.example.org instead of www.example.com), so although the first response took 300 ms the
// second load has to finish in less than one millisecond.
it('does not throttle requests to different domains', function ($loadingMethod) {
    $httpClient = Mockery::mock(ClientInterface::class);

    // First request: slow response (0.3 s busy-wait inside the mock).
    $httpClient->shouldReceive('sendRequest')->once()->andReturnUsing(function (RequestInterface $request) {
        $response = new Response(200, [], $request->getUri()->__toString() . ' response');

        helper_wait300ms();

        return $response;
    });

    // Second request: answered immediately.
    $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response(200));

    $loader = new HttpLoader(new UserAgent('SomeUserAgent'), $httpClient);

    $loader->{$loadingMethod}('https://www.example.com/foo');

    $firstResponse = microtime(true);

    $loader->{$loadingMethod}('https://www.example.org/bar');

    $secondResponse = microtime(true);

    $diff = $secondResponse - $firstResponse;

    // NOTE(review): a 1 ms budget is tight and may be sensitive to a slow or heavily loaded machine.
    expect($diff)->toBeLessThan(0.001);
})->with(['load', 'loadOrFail']);

// With a BotUserAgent, load() must return null for the /secret URL and log both the robots.txt
// load and the "not allowed" message. The client mock accepts a single request only, which is
// answered with the dummy robots.txt response.
it('respects rules from robots.txt from load method', function () {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->once()->andReturn(helper_getDummyRobotsTxtResponse());

    $loader = new HttpLoader(new BotUserAgent('FooBot'), $client);

    expect($loader->load('https://www.crwlr.software/secret'))->toBeNull();

    $loggedOutput = $this->getActualOutputForAssertion();

    expect($loggedOutput)
        ->toContain('Loaded https://www.crwlr.software/robots.txt')
        ->and($loggedOutput)
        ->toContain('Crawler is not allowed to load https://www.crwlr.software/secret');
});

// Same robots.txt scenario through loadOrFail(): the call is expected to throw a LoadingException.
it('respects rules from robots.txt from loadOrFail method', function () {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->once()->andReturn(helper_getDummyRobotsTxtResponse());

    $loader = new HttpLoader(new BotUserAgent('FooBot'), $client);
    $loader->loadOrFail('https://www.crwlr.software/secret');
})->throws(LoadingException::class);

// With a plain UserAgent (not a BotUserAgent) the /secret URL is loaded normally, and the output
// contains neither a robots.txt load message nor a "not allowed" message.
it('does not respect rules from robots.txt when user agent isn\'t instance of BotUserAgent', function () {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->once()->andReturn(helper_getDummyRobotsTxtResponse());

    $loader = new HttpLoader(new UserAgent('FooBot'), $client);

    expect($loader->load('https://www.crwlr.software/secret'))->toBeInstanceOf(RespondedRequest::class);

    $loggedOutput = $this->getActualOutputForAssertion();

    expect($loggedOutput)
        ->not()->toContain('Loaded https://www.crwlr.software/robots.txt')
        ->and($loggedOutput)
        ->not()->toContain('Crawler is not allowed to load https://www.crwlr.software/secret');
});


================================================
FILE: tests/Loader/Http/HttpLoaderTest.php
================================================
<?php

namespace tests\Loader\Http;

use Crwlr\Crawler\Cache\FileCache;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
use Crwlr\Crawler\Steps\Filters\Filter;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Exception;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use InvalidArgumentException;
use Mockery;
use PHPUnit\Framework\TestCase;
use Psr\Http\Client\ClientInterface;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\UriInterface;
use Psr\SimpleCache\CacheInterface;
use tests\_Stubs\DummyLogger;
use tests\_Stubs\RespondedRequestChild;
use Throwable;

use function tests\helper_cachedir;
use function tests\helper_getFastLoader;
use function tests\helper_nonBotUserAgent;
use function tests\helper_resetCacheDir;

// Reset the shared cache directory after every test in this file.
afterEach(fn () => helper_resetCacheDir());

/** @var TestCase $this */

// Both load() and loadOrFail() accept a plain URL string; each call results in one sendRequest().
it('accepts url string as argument to load', function () {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->twice()->andReturn(new Response());

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    foreach (['load', 'loadOrFail'] as $method) {
        $loader->{$method}('https://www.crwlr.software');
    }
});

// load() with a relative reference ('/foo') must log the "Invalid input URL" error as first message.
it('fails and logs an error when invoked with a relative reference URI', function () {
    $logger = new DummyLogger();

    $loader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger);
    $loader->load('/foo');

    $expectedMessage = 'Invalid input URL: /foo - The URI is a relative reference and therefore can\'t be loaded.';

    expect($logger->messages)->not->toBeEmpty()
        ->and($logger->messages[0]['message'])->toBe($expectedMessage);
});

// loadOrFail() with a relative reference ('/foo') is expected to throw an InvalidArgumentException.
it('fails and throws an exception when loadOrFail() is called with a relative reference URI', function () {
    $loader = new HttpLoader(helper_nonBotUserAgent(), logger: new DummyLogger());

    $loader->loadOrFail('/foo');
})->throws(InvalidArgumentException::class);

// Both load() and loadOrFail() accept a PSR-7 request object; each call results in one sendRequest().
it('accepts RequestInterface as argument to load', function () {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->twice()->andReturn(new Response());

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    foreach (['load', 'loadOrFail'] as $method) {
        $loader->{$method}(new Request('GET', 'https://www.crwlr.software'));
    }
});

// Same as the string variant, but the relative reference is wrapped in a Request object.
it('fails and logs an error when invoked with a RequestInterface object having a relative reference URI', function () {
    $logger = new DummyLogger();

    $loader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger);
    $loader->load(new Request('GET', '/foo'));

    $expectedMessage = 'Invalid input URL: /foo - The URI is a relative reference and therefore can\'t be loaded.';

    expect($logger->messages)->not->toBeEmpty()
        ->and($logger->messages[0]['message'])->toBe($expectedMessage);
});

// loadOrFail() with a Request object holding a relative reference must throw InvalidArgumentException.
it(
    'fails and throws an exception when loadOrFail() is called with a RequestInterface object having a relative ' .
    'reference URI',
    function () {
        $loader = new HttpLoader(helper_nonBotUserAgent(), logger: new DummyLogger());

        $relativeRequest = new Request('GET', '/foo');

        $loader->loadOrFail($relativeRequest);
    },
)->throws(InvalidArgumentException::class);

// beforeLoad and afterLoad hooks must run for every response class (1xx–5xx).
it(
    'calls the before and after load hooks regardless whether the response was successful or not',
    function ($responseStatusCode) {
        $httpClient = Mockery::mock(ClientInterface::class);

        $sendRequest = $httpClient->shouldReceive('sendRequest');

        // For the 300 case the client is expected to be called twice (second answer: 200).
        // NOTE(review): presumably the loader follows the 3xx response with another request — confirm.
        if ($responseStatusCode === 300) {
            $sendRequest->twice()->andReturn(new Response($responseStatusCode), new Response(200));
        } else {
            $sendRequest->once()->andReturn(new Response($responseStatusCode));
        }

        $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

        $hookCalls = ['before' => false, 'after' => false];

        $httpLoader->beforeLoad(function () use (&$hookCalls) {
            $hookCalls['before'] = true;
        });

        $httpLoader->afterLoad(function () use (&$hookCalls) {
            $hookCalls['after'] = true;
        });

        $httpLoader->load('https://www.otsch.codes');

        expect($hookCalls['before'])->toBeTrue()
            ->and($hookCalls['after'])->toBeTrue();
    },
)->with([
    [100],
    [200],
    [300],
    [400],
    [500],
]);

// The onSuccess hook must fire for 2xx responses, via load() as well as via loadOrFail().
it('calls the onSuccess hook on a successful response', function ($responseStatusCode) {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->twice()->andReturn(new Response($responseStatusCode));

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    $hookFired = false;

    $loader->onSuccess(function () use (&$hookFired) {
        $hookFired = true;
    });

    foreach (['load', 'loadOrFail'] as $method) {
        // Reset before each call so every loading method has to trigger the hook on its own.
        $hookFired = false;

        $loader->{$method}('https://www.otsch.codes');

        expect($hookFired)->toBeTrue();
    }
})->with([
    [200],
    [201],
    [202],
]);

// The onError hook must fire when load() receives a 4xx/5xx response.
it('calls the onError hook on a failed request', function ($responseStatusCode) {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->once()->andReturn(new Response($responseStatusCode));

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    $hookFired = false;

    $loader->onError(function () use (&$hookFired) {
        $hookFired = true;
    });

    $loader->load('https://www.otsch.codes');

    expect($hookFired)->toBeTrue();
})->with([
    [400],
    [404],
    [422],
    [500],
]);

// Pre-populates a FileCache with a response for the requested URL and verifies that loading it
// fires the onCacheHit hook and marks the result as served from cache.
it('calls the onCacheHit hook when a response for the request was found in the cache', function (string $loadMethod) {
    $cache = new FileCache(helper_cachedir());

    $userAgent = helper_nonBotUserAgent();

    $cachedRequest = new Request(
        'GET',
        'https://www.example.com/foo',
        ['Host' => ['www.example.com'], 'User-Agent' => [(string) $userAgent]],
    );

    $cachedEntry = new RespondedRequest($cachedRequest, new Response(body: 'Hello World!'));

    $cache->set($cachedEntry->cacheKey(), $cachedEntry);

    $httpLoader = new HttpLoader($userAgent);

    $httpLoader->setCache($cache);

    $cacheHitSeen = false;

    $httpLoader->onCacheHit(function () use (&$cacheHitSeen) {
        $cacheHitSeen = true;
    });

    /** @var RespondedRequest $response */
    $response = $httpLoader->{$loadMethod}('https://www.example.com/foo');

    expect($cacheHitSeen)->toBeTrue()
        ->and($response->isServedFromCache())->toBeTrue();
})->with(['load', 'loadOrFail']);

// loadOrFail() has to signal an error response (HTTP 400 here) by throwing a LoadingException,
// and it must not invoke the onError hook while doing so.
it('throws an Exception when request fails in loadOrFail method', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response(400));

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

    $onErrorWasCalled = false;

    $httpLoader->onError(function () use (&$onErrorWasCalled) {
        $onErrorWasCalled = true;
    });

    // Capture the exception and assert on it after the try/catch: an assertion that lives only inside
    // the catch block never runs when nothing is thrown, so the test would pass without verifying
    // the behavior it is named after.
    $thrownException = null;

    try {
        $httpLoader->loadOrFail('https://www.otsch.codes');
    } catch (LoadingException $exception) {
        $thrownException = $exception;
    }

    expect($thrownException)->toBeInstanceOf(LoadingException::class)
        ->and($onErrorWasCalled)->toBeFalse();
});

// A subclass overriding isAllowedToBeLoaded() decides which URLs are loaded: the allowed URL yields
// a RespondedRequest, any other URL yields null (and never reaches the client, which expects one call).
test('You can implement logic to disallow certain request', function () {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->once()->andReturn(new Response());

    $restrictedLoader = new class (new BotUserAgent('Foo'), $client) extends HttpLoader {
        public function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException = false): bool
        {
            $allowedUrl = 'https://www.example.com/foo';

            return $uri->__toString() === $allowedUrl;
        }
    };

    expect($restrictedLoader->load('https://www.example.com/foo'))->toBeInstanceOf(RespondedRequest::class);

    expect($restrictedLoader->load('https://www.example.com/bar'))->toBeNull();
});

// The overridden isAllowedToBeLoaded() throws only when it receives $throwsException = true.
// load() must therefore succeed (one request reaches the client), while loadOrFail() for the very
// same URL must surface the LoadingException.
test(
    'The isAllowedToBeLoaded method is called with argument throwsException true when called from loadOrFail',
    function () {
        $httpClient = Mockery::mock(ClientInterface::class);

        $httpClient->shouldReceive('sendRequest')->once()->andReturn(new Response());

        $httpLoader = new class (new BotUserAgent('Foo'), $httpClient) extends HttpLoader {
            public function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException = false): bool
            {
                if ($throwsException) {
                    throw new LoadingException('Fail to load ' . $uri->__toString());
                }

                return $uri->__toString() === 'https://www.example.com';
            }
        };

        $httpLoader->load('https://www.example.com');

        // Capture the exception and assert afterwards: with the assertion placed only inside the catch
        // block, a loadOrFail() call that throws nothing would go unnoticed by this test's own checks.
        $thrownException = null;

        try {
            $httpLoader->loadOrFail('https://www.example.com');
        } catch (LoadingException $exception) {
            $thrownException = $exception;
        }

        expect($thrownException)->toBeInstanceOf(LoadingException::class);
    },
);

// A 301 with a Location header is followed: the result keeps the originally requested URI, reports
// the redirect target as effective URI and carries the body of the final response.
it('automatically handles redirects', function (string $loadingMethod) {
    $redirectResponse = new Response(301, ['Location' => 'https://www.redirect.com']);

    $finalResponse = new Response(200, [], 'YES');

    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->twice()->andReturn($redirectResponse, $finalResponse);

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    /** @var RespondedRequest $result */
    $result = $loader->{$loadingMethod}('https://www.crwlr.software/packages');

    expect($result->requestedUri())->toBe('https://www.crwlr.software/packages')
        ->and($result->effectiveUri())->toBe('https://www.redirect.com')
        ->and($result->response->getBody()->getContents())->toBe('YES');
})->with(['load', 'loadOrFail']);

// Uses a Throttler subclass that echoes whenever its tracking methods are invoked, then checks the
// captured output for both the start and the end marker of the loaded URL.
it('calls request start and end tracking methods', function (string $loadingMethod) {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->once()->andReturn(new Response(200));

    $echoingThrottler = new class extends Throttler {
        public function trackRequestStartFor(UriInterface $url): void
        {
            echo 'Track request start ', $url, PHP_EOL;

            parent::trackRequestStartFor($url);
        }

        public function trackRequestEndFor(UriInterface $url): void
        {
            echo 'Track request end ', $url, PHP_EOL;

            parent::trackRequestEndFor($url);
        }
    };

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client, throttler: $echoingThrottler);

    $loader->{$loadingMethod}('https://www.twitter.com');

    $captured = $this->getActualOutputForAssertion();

    expect($captured)->toContain('Track request start https://www.twitter.com')
        ->and($captured)->toContain('Track request end https://www.twitter.com');
})->with(['load', 'loadOrFail']);

// With a redirect from /foo to /bar, trackRequestEndFor() must be reported exactly once in the
// output, and for the original /foo URL.
it(
    'calls trackRequestEndFor only once and with the original request URL when there is a redirect',
    function (string $loadingMethod) {
        // Builds an argument matcher accepting only requests for the given URL.
        $requestFor = fn (string $url) => fn (Request $request) => (string) $request->getUri() === $url;

        $httpClient = Mockery::mock(ClientInterface::class);

        $httpClient
            ->shouldReceive('sendRequest')
            ->once()
            ->withArgs($requestFor('https://www.example.com/foo'))
            ->andReturn(new Response(301, ['Location' => 'https://www.example.com/bar']));

        $httpClient
            ->shouldReceive('sendRequest')
            ->once()
            ->withArgs($requestFor('https://www.example.com/bar'))
            ->andReturn(new Response(200));

        $echoingThrottler = new class extends Throttler {
            public function trackRequestEndFor(UriInterface $url): void
            {
                echo 'Track request end ', $url, PHP_EOL;

                parent::trackRequestEndFor($url);
            }
        };

        $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient, throttler: $echoingThrottler);

        $httpLoader->{$loadingMethod}('https://www.example.com/foo');

        $captured = $this->getActualOutputForAssertion();

        expect($captured)->toContain('Track request end https://www.example.com/foo')
            ->and(substr_count($captured, 'Track request end'))->toBe(1);
    },
)->with(['load', 'loadOrFail']);

// A successful load must produce a "Loaded <url>" line in the output.
it('automatically logs loading success message', function ($loadingMethod) {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->once()->andReturn(new Response());

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    $loader->{$loadingMethod}(new Request('GET', 'https://phpstan.org/'));

    expect($this->getActualOutputForAssertion())->toContain('Loaded https://phpstan.org/');
})->with(['load', 'loadOrFail']);

// A 500 response through load() must produce a "Failed to load <url>" line in the output.
it('automatically logs loading error message in normal load method', function () {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->once()->andReturn(new Response(500));

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    $loader->load(new Request('GET', 'https://phpstan.org/'));

    expect($this->getActualOutputForAssertion())->toContain('Failed to load https://phpstan.org/');
});

// The request reaching the client must carry a User-Agent header containing 'FooBot'.
// NOTE(review): assumes helper_nonBotUserAgent() yields a user agent string containing 'FooBot'.
it('automatically adds the User-Agent header before sending', function () {
    $hasFooBotUserAgent = fn ($request) => str_contains($request->getHeaderLine('User-Agent'), 'FooBot');

    $client = Mockery::mock(ClientInterface::class);

    $client->shouldReceive('sendRequest')
        ->once()
        ->withArgs($hasFooBotUserAgent)
        ->andReturn(new Response());

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    $loader->load('https://www.facebook.com');
});

// When the cache reports a hit, the loader must read from it (has + get, once each) and must not
// call the HTTP client at all.
it('tries to get responses from cache', function () {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldNotReceive('sendRequest');

    $cachedEntry = new RespondedRequest(new Request('GET', '/'), new Response());

    $cache = Mockery::mock(CacheInterface::class);
    $cache->shouldReceive('has')->once()->andReturn(true);
    $cache->shouldReceive('get')->once()->andReturn($cachedEntry);

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    $loader->setCache($cache);

    $loader->load('https://www.facebook.com');
});

// Only /bar is pre-cached. Loading /foo must hit the client (isServedFromCache() false), loading
// /bar must come from the cache (isServedFromCache() true).
test(
    'when a response is served from cache, the RespondedRequest::isServedFromCache() method returns true,',
    function (string $loadMethod) {
        $cache = new FileCache(helper_cachedir());

        $userAgent = helper_nonBotUserAgent();

        $cachedRequest = new Request(
            'GET',
            'https://www.example.com/bar',
            ['Host' => ['www.example.com'], 'User-Agent' => [(string) $userAgent]],
        );

        $cachedEntry = new RespondedRequest($cachedRequest, new Response(body: 'Hi!'));

        $cache->set($cachedEntry->cacheKey(), $cachedEntry);

        $isRequestForFoo = fn (Request $request) => (string) $request->getUri() === 'https://www.example.com/foo';

        $clientMock = Mockery::mock(Client::class);

        $clientMock
            ->shouldReceive('sendRequest')
            ->once()
            ->withArgs($isRequestForFoo)
            ->andReturn(new Response(body: 'Hi!'));

        $httpLoader = (new HttpLoader($userAgent, $clientMock))->setCache($cache);

        /** @var RespondedRequest $freshResponse */
        $freshResponse = $httpLoader->{$loadMethod}('https://www.example.com/foo');

        expect($freshResponse->isServedFromCache())->toBeFalse();

        /** @var RespondedRequest $cachedResponse */
        $cachedResponse = $httpLoader->{$loadMethod}('https://www.example.com/bar');

        expect($cachedResponse->isServedFromCache())->toBeTrue();
    },
)->with(['load', 'loadOrFail']);

// skipCacheForNextRequest() must bypass an existing cache entry for exactly one request; the freshly
// loaded response is still written to the cache and served on the following request.
it(
    'does not serve a request from the cache, when skipCacheForNextRequest() was called',
    function (string $loadMethod) {
        $url = 'https://www.example.com/blog/posts';

        $cache = new FileCache(helper_cachedir());

        $userAgent = helper_nonBotUserAgent();

        $staleEntry = new RespondedRequest(
            new Request(
                'GET',
                $url,
                ['Host' => ['www.example.com'], 'User-Agent' => [(string) $userAgent]],
            ),
            new Response(body: 'previously cached blog posts'),
        );

        $cache->set($staleEntry->cacheKey(), $staleEntry);

        $clientMock = Mockery::mock(Client::class);

        $clientMock
            ->shouldReceive('sendRequest')
            ->once()
            ->withArgs(fn (Request $request) => (string) $request->getUri() === 'https://www.example.com/blog/posts')
            ->andReturn(new Response(body: 'loaded blog posts'));

        $httpLoader = (new HttpLoader($userAgent, $clientMock))
            ->setCache($cache)
            ->skipCacheForNextRequest();

        /** @var RespondedRequest $response */
        $response = $httpLoader->{$loadMethod}($url);

        expect($response->isServedFromCache())->toBeFalse()
            ->and(Http::getBodyString($response))->toBe('loaded blog posts');

        // Skipping the cache only affects reading. The loaded response is still added to the cache, so
        // the next request - made without calling the skip method again - gets that freshly loaded
        // response back from the cache.
        $response = $httpLoader->{$loadMethod}($url);

        expect($response->isServedFromCache())->toBeTrue()
            ->and(Http::getBodyString($response))->toBe('loaded blog posts');
    },
)->with(['load', 'loadOrFail']);

// The cache returns the legacy array format instead of a RespondedRequest object; the loader must
// turn it into a RespondedRequest exposing all of the array's request and response data.
it('still handles legacy (until v0.7) cached responses', function () {
    $legacyCacheEntry = [
        'requestMethod' => 'GET',
        'requestUri' => 'https://www.example.com/index',
        'requestHeaders' => ['foo' => ['bar']],
        'requestBody' => 'requestbody',
        'effectiveUri' => 'https://www.example.com/home',
        'responseStatusCode' => 201,
        'responseHeaders' => ['baz' => ['quz']],
        'responseBody' => 'responsebody',
    ];

    $client = Mockery::mock(ClientInterface::class);
    $client->shouldNotReceive('sendRequest');

    $cache = Mockery::mock(CacheInterface::class);
    $cache->shouldReceive('has')->once()->andReturn(true);
    $cache->shouldReceive('get')->once()->andReturn($legacyCacheEntry);

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    $loader->setCache($cache);

    $loaded = $loader->load('https://www.example.com/index');

    expect($loaded)->toBeInstanceOf(RespondedRequest::class)
        ->and($loaded?->request->getMethod())->toBe('GET')
        ->and($loaded?->requestedUri())->toBe('https://www.example.com/index')
        ->and($loaded?->request->getHeaders())->toHaveKey('foo')
        ->and($loaded?->request->getBody()->getContents())->toBe('requestbody')
        ->and($loaded?->effectiveUri())->toBe('https://www.example.com/home')
        ->and($loaded?->response->getStatusCode())->toBe(201)
        ->and($loaded?->response->getHeaders())->toHaveKey('baz')
        ->and($loaded?->response->getBody()->getContents())->toBe('responsebody');
});

// A cached 404 response counts as a failure: load() must invoke the onError hook.
it('fails when it gets a failed response from cache', function () {
    $client = Mockery::mock(ClientInterface::class);

    $cachedFailure = new RespondedRequest(new Request('GET', '/'), new Response(404));

    $cache = Mockery::mock(CacheInterface::class);
    $cache->shouldReceive('has')->once()->andReturn(true);
    $cache->shouldReceive('get')->once()->andReturn($cachedFailure);

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);

    $loader->setCache($cache);

    $errorHookFired = false;

    $loader->onError(function () use (&$errorHookFired) {
        $errorHookFired = true;
    });

    $loader->load('https://www.facebook.com');

    expect($errorHookFired)->toBeTrue();
});

it('fails when it gets a failed response from cache in loadOrFail', function () {
    // With a cached 404 response, loadOrFail() is expected to throw a LoadingException.
    $cachedErrorResponse = new RespondedRequest(new Request('GET', 'facebook'), new Response(404));

    $cache = Mockery::mock(CacheInterface::class);
    $cache->shouldReceive('has')->once()->andReturn(true);
    $cache->shouldReceive('get')->once()->andReturn($cachedErrorResponse);

    $loader = new HttpLoader(helper_nonBotUserAgent(), Mockery::mock(ClientInterface::class));
    $loader->setCache($cache);

    $loader->loadOrFail('https://www.facebook.com');
})->throws(LoadingException::class);

it('adds loaded responses to the cache when it has a cache', function ($loadingMethod) {
    // Cache miss: the response has to be fetched from the HTTP client exactly once...
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->once()->andReturn(new Response());

    $cache = Mockery::mock(CacheInterface::class);
    $cache->shouldReceive('has')->once()->andReturn(false);

    // ...and is then expected to be written to the cache exactly once.
    $cache->shouldReceive('set')->once();

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);
    $loader->setCache($cache);

    $loader->{$loadingMethod}('https://laravel.com/');
})->with(['load', 'loadOrFail']);

test(
    'when a cached response was an error response it retries to load it when retryCachedErrorResponses() was called',
    function (string $loadingMethod) {
        // The only verification here is the call count: the client has to be asked twice for the same URL,
        // first answering with a 404 and then with a 200.
        $client = Mockery::mock(ClientInterface::class);
        $client->shouldReceive('sendRequest')->twice()->andReturn(new Response(404), new Response(200));

        $loader = helper_getFastLoader(httpClient: $client);
        $loader->setCache(new FileCache(helper_cachedir()));
        $loader->retryCachedErrorResponses();

        // Load the same URL two times in a row. Anything thrown while loading is deliberately ignored.
        for ($attempt = 1; $attempt <= 2; $attempt++) {
            try {
                $loader->{$loadingMethod}('https://www.example.com/articles/123');
            } catch (Throwable $ignored) {
            }
        }
    },
)->with(['load', 'loadOrFail']);

test('retrying cached error responses can be restricted to only certain response status codes', function () {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->twice()->andReturn(new Response(404), new Response(400));

    $loader = helper_getFastLoader(httpClient: $client);
    $loader->setCache(new FileCache(helper_cachedir()));
    $loader->retryCachedErrorResponses()->only([404, 503]);

    // Status codes expected from three consecutive loads of the same URL. The client is limited to two
    // requests, so the last 400 can't be a fresh response from the client.
    foreach ([404, 400, 400] as $expectedStatusCode) {
        $respondedRequest = $loader->load('https://www.example.com/foo');

        expect($respondedRequest?->response->getStatusCode())->toBe($expectedStatusCode);
    }
});

test('certain error status codes can be excluded from being retried', function () {
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->twice()->andReturn(new Response(404), new Response(500));

    $loader = helper_getFastLoader(httpClient: $client);
    $loader->setCache(new FileCache(helper_cachedir()));
    $loader->retryCachedErrorResponses()->except([410, 500]);

    // Status codes expected from three consecutive loads of the same URL. The client is limited to two
    // requests, so the last 500 can't be a fresh response from the client.
    foreach ([404, 500, 500] as $expectedStatusCode) {
        $respondedRequest = $loader->load('https://www.example.com/foo');

        expect($respondedRequest?->response->getStatusCode())->toBe($expectedStatusCode);
    }
});

it(
    'adds responses to the cache but doesn\'t try to get them from the cache, when writeOnlyCache() was called',
    function ($loadingMethod) {
        $httpClient = Mockery::mock(ClientInterface::class);

        // Both loads of the same URL have to reach the client. If the loader read from the cache, the second
        // request would not be sent and the twice() expectation would not be met.
        $httpClient->shouldReceive('sendRequest')->twice()->andReturn(new Response());

        $cache = new FileCache(helper_cachedir());

        $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

        $httpLoader->setCache($cache);

        $httpLoader->writeOnlyCache();

        // The client only returns default (200) responses, so no exception is expected from either loading
        // method. The calls are intentionally not wrapped in a catch-all, so unexpected failures surface.
        $httpLoader->{$loadingMethod}('https://www.example.com/articles/123');

        $httpLoader->{$loadingMethod}('https://www.example.com/articles/123');
    },
)->with(['load', 'loadOrFail']);

test(
    'When cache filters are defined via the cacheOnlyWhereUrl() method it caches only responses for matching URLs',
    function (string $loadingMethod) {
        $client = Mockery::mock(ClientInterface::class);
        $client
            ->shouldReceive('sendRequest')
            ->twice()
            ->andReturnUsing(fn (Request $request) => new Response(200, body: $request->getUri() . ' response'));

        $cache = new FileCache(helper_cachedir());

        $loader = new HttpLoader(helper_nonBotUserAgent(), $client);
        $loader->setCache($cache);
        $loader->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'));

        // Path doesn't start with /bar/ => nothing may be found in the cache afterwards.
        $notMatching = $loader->{$loadingMethod}('https://www.example.com/foo/something');

        expect($cache->get($notMatching->cacheKey()))->toBeNull();

        // Path starts with /bar/ => the response has to be in the cache.
        $matching = $loader->{$loadingMethod}('https://www.example.com/bar/something');

        expect($cache->get($matching->cacheKey()))->toBeInstanceOf(RespondedRequest::class);
    },
)->with(['load', 'loadOrFail']);

test(
    'When multiple cache filters are defined via the cacheOnlyWhereUrl() method, all of them are used',
    function (string $loadingMethod) {
        $client = Mockery::mock(ClientInterface::class);
        $client
            ->shouldReceive('sendRequest')
            ->times(3)
            ->andReturnUsing(fn (Request $request) => new Response(200, body: $request->getUri() . ' response'));

        $cache = new FileCache(helper_cachedir());

        $loader = new HttpLoader(helper_nonBotUserAgent(), $client);
        $loader->setCache($cache);
        $loader
            ->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'))
            ->cacheOnlyWhereUrl(Filter::urlHost('www.example.com'));

        // URL => whether the response is expected to be found in the cache after loading it.
        // Only the last URL matches both the path filter and the host filter.
        $expectations = [
            'https://www.example.com/foo/something' => false,
            'https://www.crwlr.software/bar/something' => false,
            'https://www.example.com/bar/something' => true,
        ];

        foreach ($expectations as $url => $shouldBeCached) {
            $respondedRequest = $loader->{$loadingMethod}($url);

            $cacheEntry = $cache->get($respondedRequest->cacheKey());

            if ($shouldBeCached) {
                expect($cacheEntry)->toBeInstanceOf(RespondedRequest::class);
            } else {
                expect($cacheEntry)->toBeNull();
            }
        }
    },
)->with(['load', 'loadOrFail']);

test(
    'when a request was redirected, only one of the URLs has to match the filters defined via cacheOnlyWhereUrl()',
    function (string $loadingMethod) {
        // Redirect chain served by the fake client: /foo/something => /bar/something => /baz/something.
        $redirects = [
            'https://www.example.com/foo/something' => 'https://www.example.com/bar/something',
            'https://www.example.com/bar/something' => 'https://www.example.com/baz/something',
        ];

        $client = Mockery::mock(ClientInterface::class);
        $client
            ->shouldReceive('sendRequest')
            ->andReturnUsing(function (Request $request) use ($redirects) {
                $requestedUrl = (string) $request->getUri();

                if (isset($redirects[$requestedUrl])) {
                    return new Response(301, ['Location' => $redirects[$requestedUrl]]);
                }

                return new Response(200, body: $requestedUrl . ' response');
            });

        $cache = new FileCache(helper_cachedir());

        $loader = new HttpLoader(helper_nonBotUserAgent(), $client);
        $loader->setCache($cache);
        $loader->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'));

        // Starting at /foo/: the chain passes /bar/, so the response is expected in the cache.
        $viaFoo = $loader->{$loadingMethod}('https://www.example.com/foo/something');

        expect($cache->get($viaFoo->cacheKey()))->toBeInstanceOf(RespondedRequest::class);

        $cache->clear();

        // Starting at /bar/: the requested URL itself matches the filter.
        $viaBar = $loader->{$loadingMethod}('https://www.example.com/bar/something');

        expect($cache->get($viaBar->cacheKey()))->toBeInstanceOf(RespondedRequest::class);

        $cache->clear();

        // Starting at /baz/: no redirect and no URL matching the filter, so nothing may be cached.
        $viaBaz = $loader->{$loadingMethod}('https://www.example.com/baz/something');

        expect($cache->get($viaBaz->cacheKey()))->toBeNull();
    },
)->with(['load', 'loadOrFail']);

it('uses the cache only for requests that meet the filter criteria', function (string $loadingMethod) {
    // Exactly one of the two loads below is allowed to reach the client.
    $client = Mockery::mock(ClientInterface::class);
    $client
        ->shouldReceive('sendRequest')
        ->once()
        ->andReturnUsing(fn (Request $request) => new Response(200, body: $request->getUri() . ' response'));

    $userAgent = helper_nonBotUserAgent();

    $cache = new FileCache(helper_cachedir());

    // Pre-populate the cache with a response for each of the two URLs.
    foreach (['https://www.example.com/foo/test', 'https://www.example.com/bar/test'] as $cachedUrl) {
        $cachedResponse = new RespondedRequest(
            new Request('GET', $cachedUrl, headers: ['User-Agent' => $userAgent->__toString()]),
            new Response(),
        );

        $cache->set($cachedResponse->cacheKey(), $cachedResponse);
    }

    $loader = new HttpLoader($userAgent, $client);
    $loader->setCache($cache);
    $loader->cacheOnlyWhereUrl(Filter::urlPathStartsWith('/bar/'));

    $loader->{$loadingMethod}('https://www.example.com/foo/test');

    $loader->{$loadingMethod}('https://www.example.com/bar/test');
})->with(['load', 'loadOrFail']);

it('updates an existing cached response', function () {
    // The client may be used only once, so the second load below can't be served by it.
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldReceive('sendRequest')->once()->andReturn(new Response(body: 'hello'));

    $cache = new FileCache(helper_cachedir());
    $cache->clear();

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);
    $loader->setCache($cache);

    $firstResponse = $loader->load('https://www.example.com/idontknow');

    if (!$firstResponse) {
        throw new Exception('failed to get response');
    }

    // Add a child class instance, created from the first response, to the cache.
    $loader->addToCache(RespondedRequestChild::fromRespondedRequest($firstResponse));

    $secondResponse = $loader->load('https://www.example.com/idontknow');

    /** @var RespondedRequestChild $secondResponse */

    expect($secondResponse)->toBeInstanceOf(RespondedRequestChild::class)
        ->and($secondResponse->itseme())->toBe('mario');
});

it('does not add cookies to the cookie jar when a response was served from the cache', function () {
    // The client must never be used, the response has to come from the cache.
    $client = Mockery::mock(ClientInterface::class);
    $client->shouldNotReceive('sendRequest');

    $cache = new FileCache(helper_cachedir());

    $loader = new HttpLoader(helper_nonBotUserAgent(), $client);
    $loader->setCache($cache);

    // Cached response carrying a Set-Cookie header.
    $cachedRequest = new Request(
        'GET',
        'https://www.example.com/wtf',
        ['Host' => ['www.example.com'], 'User-Agent' => [(string) helper_nonBotUserAgent()]],
    );

    $cachedResponse = new RespondedRequest(
        $cachedRequest,
        new Response(headers: ['Set-Cookie' => 'foo=bar'], body: 'Wtf!'),
    );

    $cache->set($cachedResponse->cacheKey(), $cachedResponse);

    $loader->load('https://www.example.com/wtf');

    // The cookie jar is not publicly accessible, so it's read via invade().
    $cookieJar = invade($loader)->cookieJar;

    /** @var CookieJar $cookieJar */

    expect($cookieJar->allByDomain('example.com'))->toHaveCount(0);
});

test('By default it uses the cookie jar and passes on cookies', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    // Every expectation matches on URL plus the exact Cookie header the loader is expected to send, and is
    // limited to exactly one call. The Set-Cookie headers of each response define what the next request
    // has to carry.
    $httpClient->shouldReceive('sendRequest')->once()->withArgs(function (RequestInterface $request) {
        return $request->getUri()->__toString() === 'https://www.crwlr.software/';
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo']]));

    $httpClient->shouldReceive('sendRequest')->once()->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/blog' &&
            $cookiesHeader === ['cookie1=foo'];
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo', 'cookie2=bar']]));

    $httpClient->shouldReceive('sendRequest')->once()->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/contact' &&
            $cookiesHeader === ['cookie1=foo', 'cookie2=bar'];
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo2', 'cookie2=bar2', 'cookie3=baz']]));

    $httpClient->shouldReceive('sendRequest')->once()->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/packages' &&
            $cookiesHeader === ['cookie1=foo2', 'cookie2=bar2', 'cookie3=baz'];
    })->andReturn(new Response());

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

    // Assert on the actual results instead of a placeholder assertion: a request that doesn't match its
    // expectation must not end in a successfully loaded response.
    $homeResponse = $httpLoader->load('https://www.crwlr.software/');

    expect($homeResponse)->toBeInstanceOf(RespondedRequest::class);

    $blogResponse = $httpLoader->load('https://www.crwlr.software/blog');

    expect($blogResponse)->toBeInstanceOf(RespondedRequest::class);

    $contactResponse = $httpLoader->loadOrFail('https://www.crwlr.software/contact');

    expect($contactResponse)->toBeInstanceOf(RespondedRequest::class);

    $packagesResponse = $httpLoader->loadOrFail('https://www.crwlr.software/packages');

    expect($packagesResponse)->toBeInstanceOf(RespondedRequest::class);
});

test('You can turn off using the cookie jar', function () {
    $httpClient = Mockery::mock(ClientInterface::class);

    // All responses (except the last one) set cookies, but with the cookie jar turned off, none of the
    // following requests may carry a Cookie header. Every expectation is limited to exactly one call.
    $httpClient->shouldReceive('sendRequest')->once()->withArgs(function (RequestInterface $request) {
        return $request->getUri()->__toString() === 'https://www.crwlr.software/';
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo']]));

    $httpClient->shouldReceive('sendRequest')->once()->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/blog' && $cookiesHeader === [];
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo', 'cookie2=bar']]));

    $httpClient->shouldReceive('sendRequest')->once()->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/contact' && $cookiesHeader === [];
    })->andReturn(new Response(200, ['Set-Cookie' => ['cookie1=foo2', 'cookie2=bar2', 'cookie3=baz']]));

    $httpClient->shouldReceive('sendRequest')->once()->withArgs(function (RequestInterface $request) {
        $cookiesHeader = $request->getHeader('Cookie');

        return $request->getUri()->__toString() === 'https://www.crwlr.software/packages' && $cookiesHeader === [];
    })->andReturn(new Response());

    $httpLoader = new HttpLoader(helper_nonBotUserAgent(), $httpClient);

    $httpLoader->dontUseCookies();

    // Assert on the actual results instead of a placeholder assertion. This matters most for the load()
    // calls, whose return values were previously not checked at all.
    $homeResponse = $httpLoader->load('https://www.crwlr.software/');

    expect($homeResponse)->toBeInstanceOf(RespondedRequest::class);

    $blogResponse = $httpLoader->load('https://www.crwlr.software/blog');

    expect($blogResponse)->toBeInstanceOf(RespondedRequest::class);

    $contactResponse = $httpLoader->loadOrFail('https://www.crwlr.software/contact');

    expect($contactResponse)->toBeInstanceOf(RespondedRequest::class);

    $packagesResponse = $httpLoader->loadOrFail('https://www.crwlr.software/packages');

    expect($packagesResponse)->toBeInstanceOf(RespondedRequest::class);
});


================================================
FILE: tests/Loader/Http/Messages/RespondedRequestTest.php
================================================
<?php

namespace tests\Loader\Http\Messages;

use Crwlr\Crawler\Loader\Http\Browser\Screenshot;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

use function tests\helper_testfilesdir;

it('can be created from request and response objects.', function () {
    // Plain construction from a PSR-7 request/response pair.
    $instance = new RespondedRequest(new Request('GET', '/'), new Response());

    expect($instance)->toBeInstanceOf(RespondedRequest::class);
});

test('creating with a redirect response adds a redirect uri.', function ($statusCode) {
    // Runs once for each of the status codes in the data set below.
    $instance = new RespondedRequest(new Request('GET', '/'), new Response($statusCode));

    expect($instance->redirects())->toHaveCount(1);
})->with([300, 301, 302, 303, 304, 305, 307, 308]);

test('creating with non redirect responses doesn\'t add a redirect uri.', function ($statusCode) {
    // Runs once for each of the status codes in the data set below.
    $instance = new RespondedRequest(new Request('GET', '/'), new Response($statusCode));

    expect($instance->redirects())->toHaveCount(0);
})->with([101, 200, 404, 500]);

test('isRedirect returns false when the response is not a redirect', function () {
    // A single 200 response.
    $instance = new RespondedRequest(new Request('GET', '/'), new Response(200));

    expect($instance->isRedirect())->toBeFalse();
});

test('isRedirect returns true when the response is a redirect', function () {
    // A single 301 response.
    $instance = new RespondedRequest(new Request('GET', '/'), new Response(301));

    expect($instance->isRedirect())->toBeTrue();
});

test('isRedirect returns true when the last response is a redirect', function () {
    $instance = new RespondedRequest(new Request('GET', '/'), new Response(301));

    // The most recently set response is a redirect (302) as well.
    $instance->setResponse(new Response(302));

    expect($instance->isRedirect())->toBeTrue();
});

test('isRedirect returns false when the last response is not a redirect', function () {
    $instance = new RespondedRequest(new Request('GET', '/'), new Response(301));

    // The most recently set response is a 200, so the instance is no longer a redirect.
    $instance->setResponse(new Response(200));

    expect($instance->isRedirect())->toBeFalse();
});

test('the requested uri remains the same when the request was redirected.', function () {
    $instance = new RespondedRequest(
        new Request('GET', '/request-uri'),
        new Response(301, ['Location' => '/redirect-uri']),
    );

    // Final response after the redirect.
    $instance->setResponse(new Response(200));

    expect($instance->requestedUri())->toBe('/request-uri');
});

test('when request was not redirected the effective uri equals the requested uri', function () {
    // No redirect involved, just a 200 response.
    $instance = new RespondedRequest(new Request('GET', '/request-uri'), new Response(200));

    expect($instance->effectiveUri())->toBe('/request-uri');
});

test('when request was redirected the effective uri is the redirect uri', function () {
    $instance = new RespondedRequest(
        new Request('GET', '/request-uri'),
        new Response(301, ['Location' => '/redirect-uri']),
    );

    // Final response after the redirect.
    $instance->setResponse(new Response(200));

    expect($instance->effectiveUri())->toBe('/redirect-uri');
});

test('the allUris() method returns all unique URIs', function () {
    $instance = new RespondedRequest(
        new Request('GET', '/request-uri'),
        new Response(301, ['Location' => '/redirect-uri']),
    );

    // Two more redirects (the first one back to the originally requested URI), then the final 200 response.
    $followingResponses = [
        new Response(301, ['Location' => '/request-uri']),
        new Response(301, ['Location' => '/another-redirect-uri']),
        new Response(200),
    ];

    foreach ($followingResponses as $followingResponse) {
        $instance->setResponse($followingResponse);
    }

    // '/request-uri' occurs twice in the chain but is listed only once.
    expect($instance->allUris())->toBe([
        '/request-uri',
        '/redirect-uri',
        '/another-redirect-uri',
    ]);
});

it('can be serialized', function () {
    $screenshots = [new Screenshot('/path/to/screenshot.png'), new Screenshot('/another/path/to/screenshot.webp')];

    $instance = new RespondedRequest(
        new Request('POST', '/home', ['key' => 'val'], 'bod'),
        new Response(201, ['k' => 'v'], 'res'),
        $screenshots,
    );

    $instance->addRedirectUri('/index');

    // Expected serialization: nine entries, with the screenshots listed as plain path strings.
    $expected = 'O:51:"Crwlr\\Crawler\\Loader\\Http\\Messages\\RespondedRequest":9:{s:13:"requestMethod";s:4:"POST";s:10:' .
        '"requestUri";s:5:"/home";s:14:"requestHeaders";a:1:{s:3:"key";a:1:{i:0;s:3:"val";}}s:11:"requestBody";' .
        's:3:"bod";s:12:"effectiveUri";s:6:"/index";s:18:"responseStatusCode";i:201;s:15:"responseHeaders";a:1:{' .
        's:1:"k";a:1:{i:0;s:1:"v";}}s:12:"responseBody";s:3:"res";s:11:"screenshots";a:2:{i:0;' .
        's:23:"/path/to/screenshot.png";i:1;s:32:"/another/path/to/screenshot.webp";}}';

    expect(serialize($instance))->toBe($expected);
});

test('an old serialized instance without screenshots array can be unserialized', function () {
    // Serialized form with only eight entries, lacking the "screenshots" entry.
    $legacyPayload = 'O:51:"Crwlr\Crawler\Loader\Http\Messages\RespondedRequest":8:{s:13:"requestMethod";s:4:"POST";' .
        's:10:"requestUri";s:5:"/home";s:14:"requestHeaders";a:1:{s:3:"key";a:1:{i:0;s:3:"val";}}s:11:"requestBody";' .
        's:3:"bod";s:12:"effectiveUri";s:6:"/index";s:18:"responseStatusCode";i:201;s:15:"responseHeaders";a:1:{' .
        's:1:"k";a:1:{i:0;s:1:"v";}}s:12:"responseBody";s:3:"res";}';

    $instance = unserialize($legacyPayload);

    /** @var RespondedRequest $instance */

    expect($instance)->toBeInstanceOf(RespondedRequest::class);

    // Request part.
    expect($instance->request->getMethod())->toBe('POST')
        ->and($instance->request->getUri()->__toString())->toBe('/home')
        ->and($instance->request->getHeaders())->toBe(['key' => ['val']])
        ->and($instance->request->getBody()->getContents())->toBe('bod');

    // Response part.
    expect($instance->effectiveUri())->toBe('/index')
        ->and($instance->response->getStatusCode())->toBe(201)
        ->and($instance->response->getHeaders())->toBe(['k' => ['v']])
        ->and($instance->response->getBody()->getContents())->toBe('res');
});

test('a serialized instance can be unserialized', function () {
    // The screenshot paths in the payload are paths from the test files directory.
    $firstScreenshotPath = helper_testfilesdir('screenshot1.png');

    $secondScreenshotPath = helper_testfilesdir('screenshot2.jpeg');

    $payload = 'O:51:"Crwlr\Crawler\Loader\Http\Messages\RespondedRequest":9:{s:13:"requestMethod";s:4:"POST";' .
        's:10:"requestUri";s:5:"/home";s:14:"requestHeaders";a:1:{s:3:"key";a:1:{i:0;s:3:"val";}}s:11:"requestBody";' .
        's:3:"bod";s:12:"effectiveUri";s:6:"/index";s:18:"responseStatusCode";i:201;s:15:"responseHeaders";a:1:{' .
        's:1:"k";a:1:{i:0;s:1:"v";}}s:12:"responseBody";s:3:"res";s:11:"screenshots";a:2:{i:0;' .
        's:' . strlen($firstScreenshotPath) . ':"' . $firstScreenshotPath . '";i:1;' .
        's:' . strlen($secondScreenshotPath) . ':"' . $secondScreenshotPath . '";}}';

    $instance = unserialize($payload);

    /** @var RespondedRequest $instance */

    expect($instance)->toBeInstanceOf(RespondedRequest::class);

    // Request part.
    expect($instance->request->getMethod())->toBe('POST')
        ->and($instance->request->getUri()->__toString())->toBe('/home')
        ->and($instance->request->getHeaders())->toBe(['key' => ['val']])
        ->and($instance->request->getBody()->getContents())->toBe('bod');

    // Response part.
    expect($instance->effectiveUri())->toBe('/index')
        ->and($instance->response->getStatusCode())->toBe(201)
        ->and($instance->response->getHeaders())->toBe(['k' => ['v']])
        ->and($instance->response->getBody()->getContents())->toBe('res');

    // Screenshots.
    expect($instance->screenshots[0]->path)->toBe($firstScreenshotPath)
        ->and($instance->screenshots[1]->path)->toBe($secondScreenshotPath);
});

it('can be created from an old serialized array that was not containing the screenshots array', function () {
    // Serialized array with only eight keys, lacking the "screenshots" key.
    $legacyPayload = 'a:8:{s:13:"requestMethod";s:3:"GET";s:10:"requestUri";s:4:"/foo";s:14:"requestHeaders";a:0:{}s:11:' .
        '"requestBody";s:0:"";s:12:"effectiveUri";s:4:"/bar";s:18:"responseStatusCode";i:200;s:15:"responseHeaders";' .
        'a:0:{}s:12:"responseBody";s:0:"";}';

    $data = unserialize($legacyPayload);

    $instance = RespondedRequest::fromArray($data);

    expect($instance)->toBeInstanceOf(RespondedRequest::class);

    expect($instance->request->getUri()->__toString())->toBe('/foo')
        ->and($instance->effectiveUri())->toBe('/bar');
});

it('can be created from a serialized array that is containing the screenshots array', function () {
    // The screenshot paths in the payload are paths from the test files directory.
    $firstScreenshotPath = helper_testfilesdir('screenshot1.png');

    $secondScreenshotPath = helper_testfilesdir('screenshot2.jpeg');

    $payload = 'a:9:{s:13:"requestMethod";s:3:"GET";s:10:"requestUri";s:4:"/foo";s:14:"requestHeaders";a:0:{}s:11:' .
        '"requestBody";s:0:"";s:12:"effectiveUri";s:4:"/bar";s:18:"responseStatusCode";i:200;s:15:"responseHeaders";' .
        'a:0:{}s:12:"responseBody";s:0:"";s:11:"screenshots";a:2:{i:0;' .
        's:' . strlen($firstScreenshotPath) . ':"' . $firstScreenshotPath . '";i:1;' .
        's:' . strlen($secondScreenshotPath) . ':"' . $secondScreenshotPath . '";}}';

    $data = unserialize($payload);

    $instance = RespondedRequest::fromArray($data);

    expect($instance)->toBeInstanceOf(RespondedRequest::class);

    expect($instance->request->getUri()->__toString())->toBe('/foo')
        ->and($instance->effectiveUri())->toBe('/bar');

    expect($instance->screenshots[0]->path)->toBe($firstScreenshotPath)
        ->and($instance->screenshots[1]->path)->toBe($secondScreenshotPath);
});

test(
    'when creating from a serialized array, it checks screenshot paths for existence and throws away screenshots ' .
    'when the files don\'t exist',
    function () {
        // Both screenshot paths in this payload are made-up paths (/path/to/...).
        $payload = 'a:9:{s:13:"requestMethod";s:3:"GET";s:10:"requestUri";s:4:"/foo";s:14:"requestHeaders";' .
            'a:0:{}s:11:"requestBody";s:0:"";s:12:"effectiveUri";s:4:"/bar";s:18:"responseStatusCode";i:200;' .
            's:15:"responseHeaders";a:0:{}s:12:"responseBody";s:0:"";s:11:"screenshots";a:2:{i:0;' .
            's:24:"/path/to/screenshot1.png";i:1;s:25:"/path/to/screenshot2.jpeg";}}';

        $data = unserialize($payload);

        $instance = RespondedRequest::fromArray($data);

        expect($instance)->toBeInstanceOf(RespondedRequest::class);

        expect($instance->request->getUri()->__toString())->toBe('/foo')
            ->and($instance->effectiveUri())->toBe('/bar');

        expect($instance->screenshots)->toHaveCount(0);
    },
);

it('has a toArrayForResult() method', function () {
    $instance = new RespondedRequest(
        new Request('POST', '/home', ['key' => 'val'], 'bod'),
        new Response(201, ['k' => 'v'], 'res'),
        [new Screenshot('/path/to/screenshot.jpg')],
    );

    // The expected array contains the long key names, followed by the shorter aliases
    // (url, uri, status, headers, body).
    $expected = [
        'requestMethod' => 'POST',
        'requestUri' => '/home',
        'requestHeaders' => ['key' => ['val']],
        'requestBody' => 'bod',
        'effectiveUri' => '/home',
        'responseStatusCode' => 201,
        'responseHeaders' => ['k' => ['v']],
        'responseBody' => 'res',
        'screenshots' => ['/path/to/screenshot.jpg'],
        'url' => '/home',
        'uri' => '/home',
        'status' => 201,
        'headers' => ['k' => ['v']],
        'body' => 'res',
    ];

    expect($instance->toArrayForResult())->toBe($expected);
});

it('generates a cache key for an instance', function () {
    // Pins the exact key generated for a GET request to /foo/bar with a default response.
    $request = new Request('GET', '/foo/bar');

    $cacheKey = (new RespondedRequest($request, new Response()))->cacheKey();

    expect($cacheKey)->toBe('27ca75942fb28ed0d8fb3f9b077dd582');
});


================================================
FILE: tests/Loader/Http/Politeness/RobotsTxtHandlerTest.php
================================================
<?php

namespace tests\Loader\Http\Politeness;

use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Politeness\RobotsTxtHandler;
use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Response;
use GuzzleHttp\Psr7\Utils;
use Mockery;
use PHPUnit\Framework\TestCase;
use Psr\Http\Message\RequestInterface;

/**
 * Builds an HttpLoader with a mocked HTTP client for the robots.txt tests.
 *
 * When the user agent is a BotUserAgent, the mocked client answers every request whose URL ends with
 * "/robots.txt" with a 200 response containing the given content. For other user agents, no expectation
 * is registered on the client.
 *
 * @param string $robotsTxtContent Body of the faked robots.txt response.
 * @param UserAgentInterface|null $userAgent User agent for the loader; defaults to a BotUserAgent "FooBot".
 */
function helper_getLoaderWithRobotsTxt(string $robotsTxtContent = '', ?UserAgentInterface $userAgent = null): HttpLoader
{
    $userAgent ??= new BotUserAgent('FooBot');

    $client = Mockery::mock(Client::class);

    if (!$userAgent instanceof BotUserAgent) {
        return new HttpLoader($userAgent, $client);
    }

    $client
        ->shouldReceive('sendRequest')
        ->withArgs(fn (RequestInterface $request) => str_ends_with($request->getUri()->__toString(), '/robots.txt'))
        ->andReturn(new Response(200, [], Utils::streamFor($robotsTxtContent)));

    return new HttpLoader($userAgent, $client);
}

/** @var TestCase $this */

test('route is disallowed when it\'s disallowed for my user agent', function () {
    // Rule addressed directly at the default "FooBot" user agent of the helper's loader.
    $robotsTxtContent = <<<ROBOTSTXT
        User-agent: FooBot
        Disallow: /foo/
        ROBOTSTXT;

    $handler = new RobotsTxtHandler(helper_getLoaderWithRobotsTxt($robotsTxtContent));

    expect($handler->isAllowed('https://www.example.com/foo/bar'))->toBeFalse();
});

test('route is disallowed when it\'s disallowed for all user agents', function () {
    // Wildcard rule, not addressed at a specific user agent.
    $robotsTxtContent = <<<ROBOTSTXT
        User-agent: *
        Disallow: /foo/
        ROBOTSTXT;

    $handler = new RobotsTxtHandler(helper_getLoaderWithRobotsTxt($robotsTxtContent));

    expect($handler->isAllowed('https://www.example.com/foo/bar'))->toBeFalse();
});

test(
    'route is not disallowed when it\'s disallowed for all user agents but my user agent is not a BotUserAgent',
    function () {
        $robotsTxtContent = <<<ROBOTSTXT
            User-agent: *
            Disallow: /foo/
            ROBOTSTXT;

        // The loader gets a plain UserAgent instead of the default BotUserAgent.
        $loader = helper_getLoaderWithRobotsTxt($robotsTxtContent, new UserAgent('Any User Agent'));

        $handler = new RobotsTxtHandler($loader);

        expect($handler->isAllowed('https://www.example.com/foo/bar'))->toBeTrue();
    },
);

test(
    'route is not disallowed when it\'s disallowed for all user agent but I want to ignore wildcard rules',
    function () {
        $robotsTxtContent = <<<ROBOTSTXT
            User-agent: *
            Disallow: /foo/
            ROBOTSTXT;

        $handler = new RobotsTxtHandler(helper_getLoaderWithRobotsTxt($robotsTxtContent));

        // The only rule in the robots.txt content is a wildcard rule, which shall be ignored.
        $handler->ignoreWildcardRules();

        expect($handler->isAllowed('https://www.example.com/foo/bar'))->toBeTrue();
    },
);

it('gets all the sitemap URLs from robots.txt', function () {
    // Three sitemap lines, the last one with a lowercase "sitemap" directive.
    $robotsTxtContent = <<<ROBOTSTXT
        User-agent: *
        Disallow:

        Sitemap: https://www.example.com/sitemap.xml
        Sitemap: https://www.example.com/sitemap2.xml
        sitemap: https://www.example.com/sitemap3.xml
        ROBOTSTXT;

    $handler = new RobotsTxtHandler(helper_getLoaderWithRobotsTxt($robotsTxtContent));

    $sitemaps = $handler->getSitemaps('https://www.example.com/home');

    expect($sitemaps)->toBe([
        'https://www.example.com/sitemap.xml',
        'https://www.example.com/sitemap2.xml',
        'https://www.example.com/sitemap3.xml',
    ]);
});

it('fails silently when parsing fails', function () {
    // A Disallow line without any preceding User-agent line.
    $robotsTxtContent = <<<ROBOTSTXT
        Disallow: /
        ROBOTSTXT;

    $handler = new RobotsTxtHandler(helper_getLoaderWithRobotsTxt($robotsTxtContent), new CliLogger());

    expect($handler->isAllowed('https://www.example.com/anything'))->toBeTrue();

    // The failure is only expected to show up in the log output written by the CliLogger.
    expect($this->getActualOutputForAssertion())->toContain('Failed to parse robots.txt');
});


================================================
FILE: tests/Loader/Http/Politeness/ThrottlerTest.php
================================================
<?php

namespace tests\Loader\Http\Politeness;

use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
use Crwlr\Crawler\Loader\Http\Politeness\TimingUnits\MultipleOf;
use Crwlr\Url\Url;
use Crwlr\Utils\Microseconds;
use InvalidArgumentException;

// Timing test: after a simulated 0.1s request, measures how long waitForGo() blocks for the same URL and expects
// the blocking time to be between 0.1s and (a bit more than) 0.2s.
it('waits between 1.0 and 2.0 times of the time span that the last request took by default', function () {
    $url = Url::parsePsr7('https://www.example.com');

    $throttler = new Throttler();

    // Sets the minimum wait time to 1ms. NOTE(review): presumably so that the minimum wait doesn't determine the
    // measured time in this test – confirm against the Throttler implementation.
    $throttler->waitAtLeast(Microseconds::fromSeconds(0.001));

    $throttler->trackRequestStartFor($url);

    usleep(Microseconds::fromSeconds(0.1)->value); // Simulates a request that takes 0.1s.

    $throttler->trackRequestEndFor($url);

    // Reference timestamp, taken right after the end of the request was tracked.
    $requestEndTime = Microseconds::fromSeconds(microtime(true));

    $throttler->waitForGo($url);

    $readyForNextRequest = Microseconds::fromSeconds(microtime(true));

    // Time spent waiting in waitForGo(), in microseconds.
    $diff = $readyForNextRequest->subtract($requestEndTime);

    expect($diff->value)->toBeGreaterThan(100000)
        ->and($diff->value)->toBeLessThan(220000); // A bit more than * 2.0 because other things happening also take time.
});

// Timing test: with an unconfigured Throttler and a request that ends immediately after it started, waitForGo()
// is expected to block for more than 250000 microseconds (0.25s).
it('waits min 0.25s by default', function () {
    $url = Url::parsePsr7('https://www.example.com');

    $throttler = new Throttler();

    // Start and end are tracked back to back, so the tracked request duration is close to zero.
    $throttler->trackRequestStartFor($url);

    $throttler->trackRequestEndFor($url);

    // Reference timestamp, taken right after the end of the request was tracked.
    $requestEndTime = Microseconds::fromSeconds(microtime(true));

    $throttler->waitForGo($url);

    $readyForNextRequest = Microseconds::fromSeconds(microtime(true));

    // Time spent waiting in waitForGo(), in microseconds.
    $diff = $readyForNextRequest->subtract($requestEndTime);

    expect($diff->value)->toBeGreaterThan(250000);
});

// Timing test: the wait range is configured as 10x to 20x of the request duration (0.1s here), but the maximum
// wait time is set to 0.1s. The measured wait is expected to stay below 0.11s.
it('respects the max wait time you set', function () {
    $url = Url::parsePsr7('https://www.example.com');

    $throttler = new Throttler();

    $throttler
        ->waitBetween(new MultipleOf(10), new MultipleOf(20))
        ->waitAtMax(Microseconds::fromSeconds(0.1));

    $throttler->trackRequestStartFor($url);

    usleep(Microseconds::fromSeconds(0.1)->value); // Simulates a request that takes 0.1s.

    $throttler->trackRequestEndFor($url);

    // Reference timestamp, taken right after the end of the request was tracked.
    $requestEndTime = Microseconds::fromSeconds(microtime(true));

    $throttler->waitForGo($url);

    $readyForNextRequest = Microseconds::fromSeconds(microtime(true));

    // Time spent waiting in waitForGo(), in microseconds.
    $diff = $readyForNextRequest->subtract($requestEndTime);

    expect($diff->value)->toBeLessThan(110000); // A bit more than * 1.0 because other things happening also take time.
});

// Timing test: a request to www.example.com is tracked, then waitForGo() is called for a URL on a different
// domain (www.crwlr.software). That call is expected to return in less than 1000 microseconds.
it('waits only if there was already a request to the same domain', function () {
    $url = Url::parsePsr7('https://www.example.com');

    $throttler = new Throttler();

    $throttler
        ->waitBetween(new MultipleOf(10), new MultipleOf(20))
        ->waitAtMax(Microseconds::fromSeconds(0.1));

    $throttler->trackRequestStartFor($url);

    usleep(Microseconds::fromSeconds(0.01)->value); // Simulates a request that takes 0.01s.

    $throttler->trackRequestEndFor($url);

    // Reference timestamp, taken right after the end of the request was tracked.
    $requestEndTime = Microseconds::fromSeconds(microtime(true));

    // Different domain than the one the request above was tracked for.
    $throttler->waitForGo(Url::parsePsr7('https://www.crwlr.software'));

    $readyForNextRequest = Microseconds::fromSeconds(microtime(true));

    // Time spent in waitForGo(), in microseconds.
    $diff = $readyForNextRequest->subtract($requestEndTime);

    expect($diff->value)->toBeLessThan(1000);
});

it('throws an exception if you try to set different types for from and to', function () {
    // A Microseconds value as lower bound combined with a MultipleOf as upper bound.
    $from = Microseconds::fromSeconds(0.1);

    $to = new MultipleOf(0.5);

    new Throttler($from, $to);
})->throws(InvalidArgumentException::class);

it('throws an exception if you try to set the from value bigger than the to value with Microseconds', function () {
    // Lower bound (2.0s) is bigger than the upper bound (1.0s).
    $from = Microseconds::fromSeconds(2.0);

    $to = Microseconds::fromSeconds(1.0);

    new Throttler($from, $to);
})->throws(InvalidArgumentException::class);

it('throws an exception if you try to set the from value bigger than the to value with MultipleOf', function () {
    // Lower bound (factor 1.0) is bigger than the upper bound (factor 0.9).
    $from = new MultipleOf(1.0);

    $to = new MultipleOf(0.9);

    new Throttler($from, $to);
})->throws(InvalidArgumentException::class);

it('does not throw an exception when from and to values are equal', function () {
    // Equal bounds expressed as Microseconds.
    $from = Microseconds::fromSeconds(2.0);

    $to = Microseconds::fromSeconds(2.0);

    new Throttler($from, $to);

    // Equal bounds expressed as MultipleOf.
    $fromFactor = new MultipleOf(1.0);

    $toFactor = new MultipleOf(1.0);

    new Throttler($fromFactor, $toFactor);

    // Getting here means neither constructor call threw.
    expect(true)->toBeTrue();
});

test('internal _requestToUrlWasStarted returns false when _internalTrackStartFor was not called', function () {
    $throttler = new Throttler();

    $throttler
        ->waitBetween(Microseconds::fromSeconds(0.001), Microseconds::fromSeconds(0.002))
        ->waitAtMax(Microseconds::fromSeconds(0.002));

    $exampleUrl = Url::parsePsr7('https://www.example.com');

    // invade() is used to reach the non-public method on the throttler.
    $wasStarted = invade($throttler)->_requestToUrlWasStarted($exampleUrl);

    expect($wasStarted)->toBeFalse();

    $throttler->trackRequestEndFor($exampleUrl); // To check if no error/exception occurs when start was not called before.
});

test('internal _requestToUrlWasStarted returns true when _internalTrackStartFor was called', function () {
    $throttler = new Throttler();

    $throttler
        ->waitBetween(Microseconds::fromSeconds(0.001), Microseconds::fromSeconds(0.002))
        ->waitAtMax(Microseconds::fromSeconds(0.002));

    $exampleUrl = Url::parsePsr7('https://www.example.com');

    $throttler->trackRequestStartFor($exampleUrl);

    // invade() is used to reach the non-public method on the throttler.
    $invaded = invade($throttler);

    $startedAfterTrackingStart = $invaded->_requestToUrlWasStarted($exampleUrl);

    expect($startedAfterTrackingStart)->toBeTrue();

    // And after end of the request is tracked, it should return false again.
    $throttler->trackRequestEndFor($exampleUrl);

    $startedAfterTrackingEnd = $invaded->_requestToUrlWasStarted($exampleUrl);

    expect($startedAfterTrackingEnd)->toBeFalse();
});


================================================
FILE: tests/Loader/Http/Politeness/TimingUnits/MultipleOfTest.php
================================================
<?php

namespace tests\Loader\Http\Politeness\TimingUnits;

use Crwlr\Crawler\Loader\Http\Politeness\TimingUnits\MultipleOf;
use Crwlr\Utils\Microseconds;

it('calculates the multiple of a Microseconds instance', function () {
    $factor = new MultipleOf(7.89);

    $base = Microseconds::fromSeconds(1.23);

    // 7.89 times 1.23 seconds.
    $product = $factor->calc($base);

    expect($product->toSeconds())->toBe(9.7047);
});


================================================
FILE: tests/Loader/Http/ProxyManagerTest.php
================================================
<?php

namespace tests\Loader\Http;

use Crwlr\Crawler\Loader\Http\ProxyManager;

it('knows if it manages only one or multiple proxy server', function () {
    // Case 1: exactly one proxy configured.
    $singleProxyManager = new ProxyManager(['http://127.0.0.1:8001']);

    expect($singleProxyManager->hasOnlySingleProxy())->toBeTrue();

    expect($singleProxyManager->hasMultipleProxies())->toBeFalse();

    // Case 2: two proxies configured.
    $multiProxyManager = new ProxyManager(['http://127.0.0.1:8001', 'http://127.0.0.1:8002']);

    expect($multiProxyManager->hasOnlySingleProxy())->toBeFalse();

    expect($multiProxyManager->hasMultipleProxies())->toBeTrue();
});

it('returns the proxy when only one is defined', function () {
    $proxy = 'http://127.0.0.1:8003';

    $manager = new ProxyManager([$proxy]);

    // Asking twice must yield the same (only) proxy both times.
    $firstCall = $manager->getProxy();

    $secondCall = $manager->getProxy();

    expect($firstCall)->toBe($proxy);

    expect($secondCall)->toBe($proxy);
});

it('rotates the proxies when multiple are defined', function () {
    $proxies = ['http://127.0.0.1:8001', 'http://127.0.0.1:8002', 'http://127.0.0.1:8003'];

    $manager = new ProxyManager($proxies);

    // Four consecutive calls: the fourth one is expected to start over with the first proxy.
    $returnedProxies = [];

    for ($call = 0; $call < 4; $call++) {
        $returnedProxies[] = $manager->getProxy();
    }

    expect($returnedProxies)->toBe([
        'http://127.0.0.1:8001',
        'http://127.0.0.1:8002',
        'http://127.0.0.1:8003',
        'http://127.0.0.1:8001',
    ]);
});


================================================
FILE: tests/Loader/LoaderTest.php
================================================
<?php

namespace tests\Loader;

use Crwlr\Crawler\Loader\Loader;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Mockery;
use Psr\SimpleCache\CacheInterface;
use tests\_Stubs\DummyLogger;

test('You can set multiple hook callbacks for one type and they are executed when called', function (string $hookName) {
    // Minimal Loader implementation whose load() method just triggers the hook under test.
    $loader = new class (new BotUserAgent('FooBot'), $hookName) extends Loader {
        public function __construct(BotUserAgent $userAgent, private readonly string $hookName)
        {
            parent::__construct($userAgent);
        }

        public function load(mixed $subject): mixed
        {
            if ($this->hookName === 'afterLoad') {
                $this->callHook('beforeLoad'); // Loader won't run afterLoad when beforeLoad wasn't called.
            }

            $this->callHook($this->hookName);

            return 'something';
        }

        public function loadOrFail(mixed $subject): mixed
        {
            return 'something';
        }
    };

    // One flag per registered callback; each callback flips its own flag.
    $callbacksCalled = [false, false, false];

    foreach (array_keys($callbacksCalled) as $index) {
        $loader->{$hookName}(function () use (&$callbacksCalled, $index) {
            $callbacksCalled[$index] = true;
        });
    }

    $loader->load('something');

    expect($callbacksCalled)->toBe([true, true, true]);
})->with([
    'beforeLoad',
    'onCacheHit',
    'onSuccess',
    'onError',
    'afterLoad',
]);

it('does not call the afterLoad hook when beforeLoad was not called before it', function () {
    $dummyLogger = new DummyLogger();

    // This loader triggers afterLoad directly, without a preceding beforeLoad.
    $loader = new class (new BotUserAgent('FooBot'), $dummyLogger) extends Loader {
        public function load(mixed $subject): mixed
        {
            $this->callHook('afterLoad');

            return 'something';
        }

        public function loadOrFail(mixed $subject): mixed
        {
            return 'something';
        }
    };

    $afterLoadWasCalled = false;

    $loader->afterLoad(function () use (&$afterLoadWasCalled) {
        $afterLoadWasCalled = true;
    });

    $loader->load('something');

    expect($afterLoadWasCalled)->toBeFalse();

    // The skipped hook is expected to be reported via the logger.
    expect($dummyLogger->messages[0]['message'])->toStartWith(
        'The afterLoad hook was called without a preceding call to the beforeLoad hook.',
    );
});

it('calls the afterLoad hook when beforeLoad was called before it', function () {
    $dummyLogger = new DummyLogger();

    // This loader triggers beforeLoad first and afterLoad right after it.
    $loader = new class (new BotUserAgent('FooBot'), $dummyLogger) extends Loader {
        public function load(mixed $subject): mixed
        {
            $this->callHook('beforeLoad');

            $this->callHook('afterLoad');

            return 'something';
        }

        public function loadOrFail(mixed $subject): mixed
        {
            return 'something';
        }
    };

    $afterLoadWasCalled = false;

    $loader->afterLoad(function () use (&$afterLoadWasCalled) {
        $afterLoadWasCalled = true;
    });

    $loader->load('something');

    expect($afterLoadWasCalled)->toBeTrue();

    // Nothing is expected to be logged in this case.
    expect($dummyLogger->messages)->toHaveCount(0);
});

test('You can set a cache and use it in the load function', function () {
    // The mocked cache expects exactly one get('foo') call, which load() below performs.
    $cacheMock = Mockery::mock(CacheInterface::class);

    $cacheMock->shouldReceive('get')->with('foo')->once();

    $loader = new class (new BotUserAgent('FooBot')) extends Loader {
        public function load(mixed $subject): string
        {
            $this->cache?->get('foo');

            return 'something';
        }

        public function loadOrFail(mixed $subject): mixed
        {
            return 'something';
        }
    };

    $loader->setCache($cacheMock);

    $loader->load('something');
});


================================================
FILE: tests/Logger/CliLoggerTest.php
================================================
<?php

namespace tests\Logger;

use Crwlr\Crawler\Logger\CliLogger;
use PHPUnit\Framework\TestCase;

/** @var TestCase $this */

test('It prints a message', function () {
    (new CliLogger())->log('info', 'Some log message.');

    // The logger's output is read from PHPUnit's captured output.
    expect($this->getActualOutputForAssertion())->toContain('Some log message.');
});

test('It prints the log level', function () {
    (new CliLogger())->log('alert', 'Everybody panic!');

    // The level is expected in upper case, wrapped in square brackets.
    expect($this->getActualOutputForAssertion())->toContain('[ALERT]');
});

test('It starts with printing the time', function () {
    (new CliLogger())->log('warning', 'Warn about something.');

    // Output must begin with three colon separated two-digit groups, followed by a colon and six more digits.
    $this->expectOutputRegex('/^\d\d:\d\d:\d\d:\d\d\d\d\d\d/');
});

test('It has methods for all the log levels', function ($logLevel) {
    // Calls the method named like the log level (e.g. ->warning('...')).
    (new CliLogger())->{$logLevel}('Some message');

    $expectedLevelTag = '[' . strtoupper($logLevel) . ']';

    expect($this->getActualOutputForAssertion())
        ->toContain('Some message')
        ->toContain($expectedLevelTag);
})->with([
    'emergency',
    'alert',
    'critical',
    'error',
    'warning',
    'notice',
    'info',
    'debug',
]);


================================================
FILE: tests/Logger/PreStepInvocationLoggerTest.php
================================================
<?php

namespace tests\Logger;

use Crwlr\Crawler\Logger\PreStepInvocationLogger;
use tests\_Stubs\DummyLogger;

it('logs messages', function () {
    $preStepLogger = new PreStepInvocationLogger();

    $preStepLogger->info('test');

    $preStepLogger->warning('foo');

    $preStepLogger->error('some error');

    // Expected [level, message] pairs, in the order they were logged.
    $expectedEntries = [
        ['info', 'test'],
        ['warning', 'foo'],
        ['error', 'some error'],
    ];

    expect($preStepLogger->messages)->toHaveCount(3);

    foreach ($expectedEntries as $index => [$level, $message]) {
        expect($preStepLogger->messages[$index]['level'])->toBe($level)
            ->and($preStepLogger->messages[$index]['message'])->toBe($message);
    }
});

it('passes log messages to another logger', function () {
    $preStepLogger = new PreStepInvocationLogger();

    $preStepLogger->info('test');

    $preStepLogger->warning('foo');

    $preStepLogger->error('some error');

    // The messages collected so far are handed over to this second logger.
    $receivingLogger = new DummyLogger();

    $preStepLogger->passToOtherLogger($receivingLogger);

    // Expected [level, message] pairs, in the order they were logged.
    $expectedEntries = [
        ['info', 'test'],
        ['warning', 'foo'],
        ['error', 'some error'],
    ];

    expect($receivingLogger->messages)->toHaveCount(3);

    foreach ($expectedEntries as $index => [$level, $message]) {
        expect($receivingLogger->messages[$index]['level'])->toBe($level)
            ->and($receivingLogger->messages[$index]['message'])->toBe($message);
    }
});


================================================
FILE: tests/Pest.php
================================================
<?php

namespace tests;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
use Crwlr\Crawler\Loader\Http\Politeness\TimingUnits\MultipleOf;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Output;
use Crwlr\Crawler\Steps\Loading\LoadingStep;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Steps\StepInterface;
use Crwlr\Crawler\Steps\StepOutputType;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Crwlr\Crawler\Utils\OutputTypeHelper;
use Crwlr\Utils\Microseconds;
use Generator;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use GuzzleHttp\Psr7\Utils;
use Psr\Http\Client\ClientInterface;
use Psr\Log\LoggerInterface;
use stdClass;
use Symfony\Component\Process\Process;

/**
 * Static holder for the process running the PHP built-in web server that the integration tests talk to.
 */
class TestServerProcess
{
    /**
     * The server process, or null when no server process has been started (or it was already stopped).
     */
    public static ?Process $process = null;
}

// Setup for all tests in the _Integration directory: they are put in the "integration" group and share one
// PHP built-in web server process, which is started lazily and stopped after the tests ran.
uses()
    ->group('integration')
    ->beforeEach(function () {
        // The server process is started only once and then reused.
        if (TestServerProcess::$process !== null) {
            return;
        }

        $command = 'php -S localhost:8000 ' . __DIR__ . '/_Integration/Server.php';

        TestServerProcess::$process = Process::fromShellCommandline($command);

        TestServerProcess::$process->start();

        // Wait 0.1s after starting the process, before the first test runs.
        usleep(100000);
    })
    ->afterAll(function () {
        TestServerProcess::$process?->stop(3, SIGINT);

        TestServerProcess::$process = null;
    })
    ->in('_Integration');

/**
 * Debugging helper: writes a var_export() representation of the given value to the error log.
 */
function helper_dump(mixed $var): void
{
    $exported = var_export($var, true);

    error_log($exported);
}

/**
 * Debugging helper: dumps the given value, flushes the active output buffer and terminates the script.
 */
function helper_dieDump(mixed $var): void
{
    var_dump($var);

    // ob_end_flush() triggers an E_NOTICE when there is no active output buffer, so only flush when there is one.
    if (ob_get_level() > 0) {
        ob_end_flush();
    }

    exit;
}

/**
 * Creates a step that ignores its input and always yields the given value.
 *
 * The step's output type depends on the value: AssociativeArrayOrObject when
 * OutputTypeHelper::isAssociativeArrayOrObject() says so, Scalar otherwise.
 */
function helper_getValueReturningStep(mixed $value): Step
{
    return new class ($value) extends Step {
        public function __construct(private mixed $value) {}

        protected function invoke(mixed $input): Generator
        {
            yield $this->value;
        }

        public function outputType(): StepOutputType
        {
            if (OutputTypeHelper::isAssociativeArrayOrObject($this->value)) {
                return StepOutputType::AssociativeArrayOrObject;
            }

            return StepOutputType::Scalar;
        }
    };
}

/**
 * Creates a step that yields the input it receives, unchanged, as its single output.
 */
function helper_getInputReturningStep(): Step
{
    return new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield $input;
        }
    };
}

/**
 * Creates a step that yields its input incremented by one.
 */
function helper_getNumberIncrementingStep(): Step
{
    return new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            $incremented = $input + 1;

            yield $incremented;
        }
    };
}

/**
 * Creates a step that, independent of its input, yields a fixed sequence of eight number words
 * (containing duplicates) one after another.
 */
function helper_getStepYieldingMultipleNumbers(): Step
{
    return new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            // The list is sequentially indexed, so delegating to it yields the same keys and values as a
            // foreach loop yielding each element.
            yield from ['one', 'two', 'two', 'three', 'four', 'three', 'five', 'three'];
        }
    };
}

/**
 * Creates a step that yields eight arrays of the shape ['number' => <number word>, 'foo' => 'bar'].
 *
 * When the step's input is exactly `true`, the index of the number word is appended to the `foo` value
 * ('bar 0', 'bar 1', ...).
 */
function helper_getStepYieldingMultipleArraysWithNumber(): Step
{
    return new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            $numberWords = ['one', 'two', 'two', 'three', 'four', 'three', 'five', 'three'];

            foreach ($numberWords as $index => $numberWord) {
                $foo = $input === true ? 'bar ' . $index : 'bar';

                yield ['number' => $numberWord, 'foo' => $foo];
            }
        }
    };
}

/**
 * Creates a step that yields one stdClass object with the properties `number` (the given number) and `foo`.
 *
 * `foo` is 'bar', or 'bar <input>' when the step's input is an integer.
 */
function helper_getStepYieldingObjectWithNumber(int $number): Step
{
    return new class ($number) extends Step {
        public function __construct(private int $number) {}

        protected function invoke(mixed $input): Generator
        {
            $foo = is_int($input) ? 'bar ' . $input : 'bar';

            $data = ['number' => $this->number, 'foo' => $foo];

            yield helper_getStdClassWithData($data);
        }
    };
}

/**
 * Creates a step that yields eight stdClass objects with the properties `number` (a number word) and `foo`.
 *
 * When the step's input is exactly `true`, the index of the number word is appended to the `foo` value
 * ('bar 0', 'bar 1', ...), otherwise it's just 'bar'.
 */
function helper_getStepYieldingMultipleObjectsWithNumber(): Step
{
    return new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            $numberWords = ['one', 'two', 'two', 'three', 'four', 'three', 'five', 'three'];

            foreach ($numberWords as $index => $numberWord) {
                $foo = $input === true ? 'bar ' . $index : 'bar';

                yield helper_getStdClassWithData(['number' => $numberWord, 'foo' => $foo]);
            }
        }
    };
}

/**
 * Creates a step that iterates over its input and yields every element as a separate output.
 */
function helper_getStepYieldingInputArrayAsSeparateOutputs(): Step
{
    return new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            foreach ($input as $element) {
                yield $element;
            }
        }
    };
}

/**
 * Creates a step that uses the LoadingStep trait and, regardless of its input, yields the string 'yo'.
 */
function helper_getLoadingStep(): Step
{
    return new class extends Step {
        /**
         * @use LoadingStep<LoaderInterface>
         */
        use LoadingStep;

        protected function invoke(mixed $input): Generator
        {
            yield 'yo';
        }
    };
}

/**
 * Builds a 200 response with a small robots.txt body containing one Disallow rule for the user agent FooBot.
 *
 * The disallowed path is `/<forDomain>/secret` when a (truthy) $forDomain is given, otherwise `secret`.
 */
function helper_getDummyRobotsTxtResponse(?string $forDomain = null): Response
{
    $disallowedPath = $forDomain ? '/' . $forDomain . '/secret' : 'secret';

    $body = "User-agent: FooBot\n" . "Disallow: " . $disallowedPath;

    return new Response(200, [], $body);
}

/**
 * Iterates over the given iterable from start to end and discards all the values.
 *
 * Can be used to make a lazy iterable (like a generator) actually run through.
 *
 * @param iterable<mixed> $iterable
 * @return void
 */
function helper_traverseIterable(iterable $iterable): void
{
    foreach ($iterable as $value) {
        // just traverse
    }
}

/**
 * Turns an array into a Generator, yielding the array's values in order (the array's keys are not kept).
 *
 * @param mixed[] $array
 * @return Generator<mixed>
 */
function helper_arrayToGenerator(array $array): Generator
{
    // array_values() re-indexes the array, so the yielded keys are 0, 1, 2, ...
    yield from array_values($array);
}

/**
 * Runs through the given Generator and collects all yielded values in a sequentially indexed array.
 *
 * @param Generator<mixed> $generator
 * @return mixed[]
 */
function helper_generatorToArray(Generator $generator): array
{
    // Keys yielded by the generator are intentionally dropped (preserve_keys = false).
    return iterator_to_array($generator, false);
}

/**
 * Invokes the step with the given input (or the string 'anything' when the input is null) and returns all the
 * outputs it yields as an array.
 *
 * @return Output[]
 */
function helper_invokeStepWithInput(StepInterface $step, mixed $input = null): array
{
    $stepInput = new Input($input ?? 'anything');

    $outputs = $step->invokeStep($stepInput);

    return helper_generatorToArray($outputs);
}

/**
 * Reads a file from the tests' Steps/_Files directory and returns its content.
 *
 * Returns an empty string when the file can't be read.
 */
function helper_getStepFilesContent(string $filePathInFilesFolder): string
{
    $fileContent = file_get_contents(__DIR__ . '/Steps/_Files/' . $filePathInFilesFolder);

    return $fileContent === false ? '' : $fileContent;
}

/**
 * Creates a stdClass object and sets one property per array element (array key = property name).
 *
 * @param mixed[] $data
 */
function helper_getStdClassWithData(array $data): stdClass
{
    $instance = new stdClass();

    foreach ($data as $propertyName => $propertyValue) {
        $instance->{$propertyName} = $propertyValue;
    }

    return $instance;
}

/**
 * Returns the HTML of a simple unordered list (id "list") with four items (class "item"): one, two, three, four.
 */
function helper_getSimpleListHtml(): string
{
    // Nowdoc, as the markup contains nothing to interpolate.
    $html = <<<'HTML'
        <ul id="list">
            <li class="item">one</li>
            <li class="item">two</li>
            <li class="item">three</li>
            <li class="item">four</li>
        </ul>
        HTML;

    return $html;
}

/**
 * Creates an HttpLoader whose throttler is configured with very small wait times, so tests don't have to wait.
 *
 * When no user agent is given, UserAgent::mozilla5CompatibleBrowser() is used.
 */
function helper_getFastLoader(
    ?UserAgentInterface $userAgent = null,
    ?LoggerInterface $logger = null,
    ?ClientInterface $httpClient = null,
): HttpLoader {
    $fastLoader = new HttpLoader($userAgent ?? UserAgent::mozilla5CompatibleBrowser(), $httpClient, $logger);

    $throttler = $fastLoader->throttle();

    $throttler
        ->waitBetween(new MultipleOf(0.0001), new MultipleOf(0.0002))
        ->waitAtLeast(Microseconds::fromSeconds(0.0001));

    return $fastLoader;
}

/**
 * Creates an HttpCrawler with the user agent "TestBot" that loads via the loader from helper_getFastLoader().
 */
function helper_getFastCrawler(): HttpCrawler
{
    return new class extends HttpCrawler {
        protected function userAgent(): UserAgentInterface
        {
            $testBot = new UserAgent('TestBot');

            return $testBot;
        }

        protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
        {
            $fastLoader = helper_getFastLoader($userAgent, $logger);

            return $fastLoader;
        }
    };
}

/**
 * Creates a plain UserAgent instance with a browser-like user agent string.
 */
function helper_nonBotUserAgent(): UserAgent
{
    $userAgentString = 'Mozilla/5.0 (compatible; FooBot)';

    return new UserAgent($userAgentString);
}

/**
 * Creates a Throttler configured with very small values: a range of 0.0001 to 0.0002 times (MultipleOf) and
 * 0.0001 seconds as third constructor argument.
 */
function helper_getMinThrottler(): Throttler
{
    $from = new MultipleOf(0.0001);

    $to = new MultipleOf(0.0002);

    $third = Microseconds::fromSeconds(0.0001);

    return new Throttler($from, $to, $third);
}

/**
 * Builds a RespondedRequest from a freshly created request and response.
 *
 * Request and response only get a body stream when the corresponding body argument is not null.
 *
 * @param array<string, string|string[]> $requestHeaders
 * @param array<string, string|string[]> $responseHeaders
 */
function helper_getRespondedRequest(
    string $method = 'GET',
    string $url = 'https://www.example.com/foo',
    array $requestHeaders = [],
    ?string $requestBody = null,
    int $statusCode = 200,
    array $responseHeaders = [],
    ?string $responseBody = null,
): RespondedRequest {
    $request = $requestBody === null
        ? new Request($method, $url, $requestHeaders)
        : new Request($method, $url, $requestHeaders, Utils::streamFor($requestBody));

    $response = $responseBody === null
        ? new Response($statusCode, $responseHeaders)
        : new Response($statusCode, $responseHeaders, body: Utils::streamFor($responseBody));

    return new RespondedRequest($request, $response);
}

/**
 * Returns the path of the tests' cache directory, or of a path inside of it when $inDir is given.
 *
 * $inDir may be given with or without a leading slash.
 */
function helper_cachedir(?string $inDir = null): string
{
    $baseDir = __DIR__ . '/_Temp/_cachedir';

    if ($inDir === null) {
        return $baseDir;
    }

    $separator = str_starts_with($inDir, '/') ? '' : '/';

    return $baseDir . $separator . $inDir;
}

/**
 * Cleans up the tests' cache directory, using helper_resetTempDir().
 */
function helper_resetCacheDir(): void
{
    $cacheDir = helper_cachedir();

    helper_resetTempDir($cacheDir);
}

/**
 * Returns the path of the tests' storage directory, or of a path inside of it when $inDir is given.
 *
 * $inDir may be given with or without a leading slash.
 */
function helper_storagedir(?string $inDir = null): string
{
    $baseDir = __DIR__ . '/_Temp/_storagedir';

    if ($inDir === null) {
        return $baseDir;
    }

    $separator = str_starts_with($inDir, '/') ? '' : '/';

    return $baseDir . $separator . $inDir;
}

/**
 * Cleans up the tests' storage directory, using helper_resetTempDir().
 */
function helper_resetStorageDir(): void
{
    $storageDir = helper_storagedir();

    helper_resetTempDir($storageDir);
}

/**
 * Deletes all the files directly inside the given directory, except for the `.gitkeep` placeholder file.
 *
 * Entries that can't be removed with unlink() (e.g. sub-directories) are silently left in place.
 * When the directory doesn't exist, nothing happens.
 */
function helper_resetTempDir(string $dirPath): void
{
    // scandir() triggers an E_WARNING for a path that isn't an existing directory, so bail out before calling it.
    if (!is_dir($dirPath)) {
        return;
    }

    $files = scandir($dirPath);

    if (!is_array($files)) {
        return;
    }

    foreach ($files as $file) {
        if ($file === '.' || $file === '..' || $file === '.gitkeep') {
            continue;
        }

        // Best effort cleanup: failures are deliberately ignored.
        @unlink($dirPath . '/' . $file);
    }
}

/**
 * Returns the path of the tests' test-files directory, or of a path inside of it when $inDir is given.
 *
 * $inDir may be given with or without a leading slash.
 */
function helper_testfilesdir(?string $inDir = null): string
{
    $baseDir = __DIR__ . '/_Temp/_testfilesdir';

    if ($inDir === null) {
        return $baseDir;
    }

    $separator = str_starts_with($inDir, '/') ? '' : '/';

    return $baseDir . $separator . $inDir;
}


================================================
FILE: tests/ResultTest.php
================================================
<?php

namespace tests;

use Crwlr\Crawler\Loader\Http\Browser\Screenshot;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Result;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

test('You can set and get a property', function () {
    $jobResult = new Result();

    $jobResult->set('title', 'PHP Web Developer');

    // Reading the property back must return exactly the value that was set.
    $title = $jobResult->get('title');

    expect($title)->toBe('PHP Web Developer');
});

test('You can set multiple values for a property', function () {
    $jobResult = new Result();

    // After the first set() call the property holds the plain value.
    $jobResult->set('location', 'Linz');

    $afterFirstSet = $jobResult->get('location');

    expect($afterFirstSet)->toBe('Linz');

    // After setting the same key again, the property is expected to be an array of both values.
    $jobResult->set('location', 'Wien');

    $afterSecondSet = $jobResult->get('location');

    expect($afterSecondSet)->toBe(['Linz', 'Wien']);
});

test('The get method has a default value that you can set yourself', function () {
    $emptyResult = new Result();

    // Without an explicit default, a missing property yields null.
    expect($emptyResult->get('foo'))->toBeNull();

    // With an explicit default, that default is returned instead.
    expect($emptyResult->get('foo', '123'))->toBe('123');
});

test('You can convert it to a plain array', function () {
    $jobResult = new Result();

    $jobResult->set('title', 'PHP Web Developer (w/m/x)');

    // Setting `location` twice makes it a list of both values.
    $jobResult->set('location', 'Linz');

    $jobResult->set('location', 'Wien');

    $expectedArray = [
        'title' => 'PHP Web Developer (w/m/x)',
        'location' => ['Linz', 'Wien'],
    ];

    expect($jobResult->toArray())->toBe($expectedArray);
});

test('Converting to an array, also converts all objects at any level in the array to arrays', function () {
    // An object containing another object, to check that the conversion isn't limited to the first level.
    $innerObject = helper_getStdClassWithData(['d' => 'e', 'f' => 'g']);

    $outerObject = helper_getStdClassWithData(['a' => 'b', 'c' => $innerObject]);

    $result = new Result();

    $result->set('foo', 'one');

    $result->set('bar', $outerObject);

    $expectedArray = [
        'foo' => 'one',
        'bar' => [
            'a' => 'b',
            'c' => ['d' => 'e', 'f' => 'g'],
        ],
    ];

    expect($result->toArray())->toBe($expectedArray);
});

test(
    'when the only element of the output array is some unnamed property, but the value is an array with keys, ' .
    'it returns only that child array',
    function () {
        $respondedRequest = new RespondedRequest(
            new Request('GET', 'https://www.example.com/foo'),
            new Response(200, [], 'Hello World!'),
            [new Screenshot('/path/to/screenshot.png')],
        );

        $result = new Result();

        $result->set('unnamed', $respondedRequest);

        // The array is expected to contain the RespondedRequest's data directly, without an `unnamed` wrapper key.
        $asArray = $result->toArray();

        expect($asArray)->toBeArray()
            ->and(count($asArray))->toBeGreaterThanOrEqual(14)
            ->and($asArray['url'])->toBe('https://www.example.com/foo')
            ->and($asArray['status'])->toBe(200)
            ->and($asArray['body'])->toBe('Hello World!')
            ->and($asArray['screenshots'][0])->toBe('/path/to/screenshot.png');
    },
);

test(
    'when the only element of the output array is an unnamed property, with a scalar value, it returns the unnamed key',
    function () {
        $result = new Result();

        $result->set('unnamed', 'foo');

        // With a scalar value, the `unnamed` key is kept in the array.
        expect($result->toArray())->toBe(['unnamed' => 'foo']);
    },
);

test('when you add something with empty string as key it creates a name with incrementing number', function () {
    $result = new Result();

    // Each value added with an empty key is expected under the next `unnamed<N>` name.
    $expectedNames = ['unnamed1' => 'foo', 'unnamed2' => 'bar', 'unnamed3' => 'baz'];

    foreach ($expectedNames as $expectedName => $value) {
        $result->set('', $value);

        expect($result->get($expectedName))->toBe($value);
    }
});

test('you can create a new instance from another instance', function () {
    $original = new Result();

    $original->set('foo', 'bar');

    $copy = new Result($original);

    // The copy starts out with the data of the original.
    expect($original->get('foo'))->toBe('bar');

    expect($copy->get('foo'))->toBe('bar');

    // Changing the copy must not change the original.
    $copy->set('baz', 'quz');

    expect($original->get('baz'))->toBeNull();

    expect($copy->get('baz'))->toBe('quz');
});

test('it makes a proper array of arrays if you repeatedly add (associative) arrays with the same key', function () {
    $firstEntry = ['bar' => 'one', 'baz' => 'two'];

    $secondEntry = ['bar' => 'three', 'baz' => 'four'];

    $result = new Result();

    // After the first set() call, the value is the associative array itself.
    $result->set('foo', $firstEntry);

    expect($result->get('foo'))->toBe(['bar' => 'one', 'baz' => 'two']);

    // After the second call, the value is a list containing both associative arrays.
    $result->set('foo', $secondEntry);

    expect($result->get('foo'))->toBe([
        ['bar' => 'one', 'baz' => 'two'],
        ['bar' => 'three', 'baz' => 'four'],
    ]);
});


================================================
FILE: tests/Steps/BaseStepTest.php
================================================
<?php

namespace tests\Steps;

use Crwlr\Crawler\Crawler;
use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Logger\PreStepInvocationLogger;
use Crwlr\Crawler\Output;
use Crwlr\Crawler\Steps\BaseStep;
use Crwlr\Crawler\Steps\Exceptions\PreRunValidationException;
use Crwlr\Crawler\Steps\Filters\Filter;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Refiners\StringRefiner;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Steps\StepOutputType;
use Exception;
use Generator;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use InvalidArgumentException;
use PHPUnit\Framework\TestCase;
use tests\_Stubs\DummyLogger;

use function tests\helper_getInputReturningStep;
use function tests\helper_getStdClassWithData;
use function tests\helper_getStepFilesContent;
use function tests\helper_getValueReturningStep;
use function tests\helper_invokeStepWithInput;

/**
 * Minimal step for the filter tests in this file: each invocation records whether the raw input value passes
 * all filters configured on the step and then yields a fixed output.
 */
class TestStep extends BaseStep
{
    /** Outcome of the passesAllFilters() call of the latest invocation; null before the first invocation. */
    public ?bool $passesAllFilters = null;

    public function invokeStep(Input $input): Generator
    {
        $inputValue = $input->get();

        $this->passesAllFilters = $this->passesAllFilters($inputValue);

        yield new Output('yo');
    }
}

/** @var TestCase $this */

// One filter added via where(): the matching value passes, a different value does not.
test('You can set a filter and passesAllFilters() tells if an output value passes that filter', function () {
    $filteredStep = new TestStep();

    $filteredStep->where(Filter::equal('hello'));

    $expectations = [
        'hello' => true,
        'hola' => false,
    ];

    foreach ($expectations as $inputValue => $shouldPass) {
        helper_invokeStepWithInput($filteredStep, new Input($inputValue));

        expect($filteredStep->passesAllFilters)->toBe($shouldPass);
    }
});

// Several where() filters: only the value satisfying every single one of them is expected to pass.
test('You can set multiple filters and passesAllFilters() tells if an output value passes that filters', function () {
    $filteredStep = new TestStep();

    $filteredStep
        ->where(Filter::stringContains('foo'))
        ->where(Filter::equal('boo foo too'))
        ->where(Filter::notEqual('pew foo tew'));

    $expectations = [
        'boo foo too' => true,
        'foo something' => false,
        'pew foo tew' => false,
    ];

    foreach ($expectations as $inputValue => $shouldPass) {
        helper_invokeStepWithInput($filteredStep, new Input($inputValue));

        expect($filteredStep->passesAllFilters)->toBe($shouldPass);
    }
});

// Filters linked via orWhere(): a value matching any one of them passes, a value matching none does not.
test(
    'you can link filters using orWhere and passesAllFilters() is true when one of those filters evaluates to true',
    function () {
        $filteredStep = new TestStep();

        $filteredStep
            ->where(Filter::stringStartsWith('foo'))
            ->orWhere(Filter::stringStartsWith('bar'))
            ->orWhere(Filter::stringEndsWith('foo'));

        $expectations = [
            'foo bar baz' => true,
            'bar foo baz' => true,
            'bar baz foo' => true,
            'funky town' => false,
        ];

        foreach ($expectations as $inputValue => $shouldPass) {
            helper_invokeStepWithInput($filteredStep, new Input($inputValue));

            expect($filteredStep->passesAllFilters)->toBe($shouldPass);
        }
    },
);

// With a key as first argument to where(), the filter is checked against that key of an array input.
// The description names where(), the method that is actually called here.
it('uses a key from an array when providing a key to the where() method', function () {
    $step = new TestStep();

    $step->where('vendor', Filter::equal('crwlr'));

    helper_invokeStepWithInput($step, new Input(['vendor' => 'crwlr', 'package' => 'url']));

    expect($step->passesAllFilters)->toBeTrue();

    helper_invokeStepWithInput($step, new Input(['vendor' => 'illuminate', 'package' => 'support']));

    expect($step->passesAllFilters)->toBeFalse();
});

// With a key as first argument to where(), the filter is checked against that property of an object input.
// The description names where(), the method that is actually called here.
it('uses a key from an object when providing a key to the where() method', function () {
    $step = new TestStep();

    $step->where('vendor', Filter::equal('crwlr'));

    helper_invokeStepWithInput($step, new Input(
        helper_getStdClassWithData(['vendor' => 'crwlr', 'package' => 'url']),
    ));

    expect($step->passesAllFilters)->toBeTrue();

    helper_invokeStepWithInput($step, new Input(
        helper_getStdClassWithData(['vendor' => 'illuminate', 'package' => 'support']),
    ));

    expect($step->passesAllFilters)->toBeFalse();
});

// A custom filter callback on the 'bar' key: 'two' is in the allowed list, 'four' is not.
it('filters using a custom Closure filter', function () {
    $allowsOneTwoThree = fn (mixed $value) => in_array($value, ['one', 'two', 'three'], true);

    $filteredStep = new TestStep();

    $filteredStep->where('bar', Filter::custom($allowsOneTwoThree));

    helper_invokeStepWithInput($filteredStep, ['foo' => 'one', 'bar' => 'two']);

    expect($filteredStep->passesAllFilters)->toBeTrue();

    helper_invokeStepWithInput($filteredStep, ['foo' => 'three', 'bar' => 'four']);

    expect($filteredStep->passesAllFilters)->toBeFalse();
});

// Calling where() with only a string and no filter is expected to be rejected with an InvalidArgumentException.
it('throws an exception when you provide a string as first argument to filter but no second argument', function () {
    (new TestStep())->where('test');
})->throws(InvalidArgumentException::class);

// The fixture file starts with a byte order mark. After the step's input sanitizing, the yielded string is
// expected to start directly with the XML declaration.
it('removes an UTF-8 byte order mark from the beginning of a string', function () {
    $step = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield $input;
        }

        protected function validateAndSanitizeInput(mixed $input): mixed
        {
            return parent::validateAndSanitizeStringOrHttpResponse($input);
        }
    };

    $contentWithBom = helper_getStepFilesContent('Xml/rss-with-bom.xml');

    $inputs = [
        // Once as the body of an HTTP response ...
        new RespondedRequest(
            new Request('GET', 'https://www.example.com/rss'),
            new Response(body: $contentWithBom),
        ),
        // ... and once as a plain string.
        $contentWithBom,
    ];

    foreach ($inputs as $input) {
        $outputs = helper_invokeStepWithInput($step, $input);

        expect($outputs)->toHaveCount(1)
            ->and($outputs[0]->get())->toBeString()
            ->and(substr($outputs[0]->get(), 0, 5))->toBe('<?xml');
    }
});

// The step logs two messages to a PreStepInvocationLogger in its constructor. Once the real logger is added,
// both messages are expected to arrive there, in the original order and with their original levels.
it(
    'transfers log messages already logged when the Step uses a PreStepInvocationLogger before receiving the logger ' .
    'from the crawler',
    function () {
        $step = new class extends Step {
            public function __construct()
            {
                $this->addLogger(new PreStepInvocationLogger());

                $this->logger?->info('test');

                $this->logger?->warning('foo');
            }

            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }
        };

        $crawlerLogger = new DummyLogger();

        $step->addLogger($crawlerLogger);

        expect($crawlerLogger->messages)->toHaveCount(2);

        [$firstMessage, $secondMessage] = $crawlerLogger->messages;

        expect($firstMessage['level'])->toBe('info');

        expect($firstMessage['message'])->toBe('test');

        expect($secondMessage['level'])->toBe('warning');

        expect($secondMessage['message'])->toBe('foo');
    },
);

// The refiner is added while the step still has the PreStepInvocationLogger. The refiner's message about the
// float value is expected to end up in the logger that is added afterwards.
it(
    'when using a PreStepInvocationLogger, the later created logger is also passed to refiners, so its log messages ' .
    'won\'t be lost',
    function () {
        $step = new class extends Step {
            public function __construct()
            {
                $this->addLogger(new PreStepInvocationLogger());

                $this->logger?->info('test');
            }

            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }
        };

        $step->refineOutput('foo', StringRefiner::replace('foo', 'bar'));

        $finalLogger = new DummyLogger();

        $step->addLogger($finalLogger);

        helper_invokeStepWithInput($step, ['foo' => 1.2]);

        expect($finalLogger->messages)->toHaveCount(2);

        expect($finalLogger->messages[1]['message'])->toBe(
            'Refiner StringRefiner::replace() can\'t be applied to value of type double',
        );
    },
);

/* ----------------------------- validateBeforeRun() ----------------------------- */

// A step declaring scalar output type, with keep() but without keepAs(), is expected to fail the pre-run
// validation with a PreRunValidationException.
it(
    'throws an exception in validateBeforeRun() when output type is scalar and keep() was used but not keepAs()',
    function () {
        $scalarStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }

            public function outputType(): StepOutputType
            {
                return StepOutputType::Scalar;
            }
        };

        $previousStep = Http::get();

        $scalarStep->keep()->validateBeforeRun($previousStep);
    },
)->throws(PreRunValidationException::class);

// The named step class does not override outputType(). With keep() but without keepAs(), a warning mentioning
// the step's class name is expected in the CLI logger output.
it(
    'logs a warning in validateBeforeRun() when output type is mixed and keep() was used but not keepAs()',
    function () {
        class SomeDemoStep extends Step
        {
            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }
        }

        $previousStep = Http::get();

        (new SomeDemoStep())
            ->addLogger(new CliLogger())
            ->keep()
            ->validateBeforeRun($previousStep);

        $loggedOutput = $this->getActualOutputForAssertion();

        expect($loggedOutput)
            ->toContain('The tests\Steps\SomeDemoStep step potentially yields scalar value outputs');
    },
);

// For an anonymous class extending a concrete step class, the warning is expected to name that parent class.
test(
    'the warning message, when output type is mixed and keep() was used but not keepAs() with an anonymous step ' .
    'class, extending a step that isn\'t one of the abstract classes Step or BaseStep, contains the parent step ' .
    'class',
    function () {
        class ParentStepClass extends Step
        {
            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }
        }

        $anonymousStep = new class extends ParentStepClass {};

        $previousStep = Http::get();

        $anonymousStep
            ->addLogger(new CliLogger())
            ->keep()
            ->validateBeforeRun($previousStep);

        $loggedOutput = $this->getActualOutputForAssertion();

        expect($loggedOutput)->toContain(
            'An anonymous class step, that is extending the tests\\Steps\\ParentStepClass step potentially ' .
            'yields scalar value outputs',
        );
    },
);

// Runs once per dataset entry (Step and BaseStep). For an anonymous class directly extending one of those, the
// warning is expected to only call it an anonymous class step.
test(
    'the warning message, when output type is mixed and keep() was used but not keepAs() with an anonymous step ' .
    'class, extending one of the abstract classes Step or BaseStep, only mentions that it is an anonymous step class',
    function (string $extendClass) {
        $step = match ($extendClass) {
            Step::class => new class extends Step {
                protected function invoke(mixed $input): Generator
                {
                    yield $input;
                }
            },
            BaseStep::class => new class extends BaseStep {
                protected function invoke(mixed $input): Generator
                {
                    yield $input;
                }

                public function invokeStep(Input $input): Generator
                {
                    yield from $this->invoke($input);
                }
            },
            default => throw new Exception('Invalid $extendClass parameter'),
        };

        $step->addLogger(new CliLogger())->keep()->validateBeforeRun(Http::get());

        $loggedOutput = $this->getActualOutputForAssertion();

        expect($loggedOutput)->toContain('An anonymous class step potentially yields scalar value outputs');
    },
)->with([
    [Step::class],
    [BaseStep::class],
]);

// The step under test comes from helper_getInputReturningStep(), so a warning about it would not carry the
// SomeDemoStep class name. The assertion therefore checks for the class independent part of the warning
// text, otherwise it could never fail.
it('does not throw an exception or log a warning when output type is scalar and keepAs() was called', function () {
    helper_getInputReturningStep()->addLogger(new CliLogger())->keepAs('foo')->validateBeforeRun(Http::get());

    expect($this->getActualOutputForAssertion())
        ->not()
        ->toContain('potentially yields scalar value outputs');
});

// The step under test comes from helper_getInputReturningStep(), so a warning about it would not carry the
// SomeDemoStep class name. The assertion therefore checks for the class independent part of the warning
// text, otherwise it could never fail.
it('does not throw an exception or log a warning when output type is scalar and outputKey() was called', function () {
    helper_getInputReturningStep()->addLogger(new CliLogger())->outputKey('foo')->validateBeforeRun(Http::get());

    expect($this->getActualOutputForAssertion())
        ->not()
        ->toContain('potentially yields scalar value outputs');
});

// One of the initial inputs is a plain string, which is expected to fail validation when keepFromInput() is used.
it('throws an exception when keepFromInput() was called and initial inputs contain a scalar value', function () {
    $initialInputs = [
        ['foo' => 'bar', 'baz' => 'quz'],
        'scalar',
    ];

    Http::get()->keepFromInput()->validateBeforeRun($initialInputs);
})->throws(PreRunValidationException::class);

// All initial inputs are associative arrays, so keepFromInput() is expected to pass validation.
it('does not throw an exception when keepFromInput() was called and initial inputs are associative array', function () {
    $initialInputs = [
        ['foo' => 'one'],
        ['foo' => 'two'],
    ];

    Http::get()->keepFromInput()->validateBeforeRun($initialInputs);
})->throwsNoExceptions();

// Validating against an empty list of initial inputs is expected to produce a log message about it.
it('logs an error when initial inputs are empty', function () {
    $firstStep = Http::get()->addLogger(new CliLogger());

    $firstStep->validateBeforeRun([]);

    $loggedOutput = $this->getActualOutputForAssertion();

    expect($loggedOutput)->toContain('You did not provide any initial inputs for your crawler.');
});

// The previous step is Html::getLink(); keepFromInput() on the following step is expected to fail validation.
it('throws an exception when keepFromInput() was called and previous step yields scalar outputs', function () {
    $previousStep = Html::getLink('.link');

    Http::get()->keepFromInput()->validateBeforeRun($previousStep);
})->throws(PreRunValidationException::class);

// Same previous step as in the keepFromInput() case, but with keepInputAs() validation is expected to pass.
it('does not throw an exception when keepInputAs() was called and previous step yields scalar outputs', function () {
    $previousStep = Html::getLink('.link');

    Http::get()->keepInputAs('link')->validateBeforeRun($previousStep);
})->throwsNoExceptions();

// The previous step declares the mixed output type; keepFromInput() on the next step is expected to only log a
// warning containing both message fragments checked below.
it('logs a warning, when keepFromInput() was called and previous step yields mixed outputs', function () {
    $previousStep = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'yo';
        }

        public function outputType(): StepOutputType
        {
            return StepOutputType::Mixed;
        }
    };

    Http::get()->keepFromInput()->addLogger(new CliLogger())->validateBeforeRun($previousStep);

    $loggedOutput = $this->getActualOutputForAssertion();

    expect($loggedOutput)->toContain('potentially yields scalar value outputs ');

    expect($loggedOutput)->toContain('the next step can not keep it by using keepFromInput()');
});

// The previous step is an anonymous class extending a concrete step class; the warning is expected to name
// that parent class.
test(
    'the warning message, when keepFromInput() was called and previous step yields mixed outputs with an anonymous ' .
    'step class, extending a step that isn\'t one of the abstract classes Step or BaseStep, contains the parent step ' .
    'class',
    function () {
        class ParentStepClassTwo extends Step
        {
            protected function invoke(mixed $input): Generator
            {
                yield 'yo';
            }

            public function outputType(): StepOutputType
            {
                return StepOutputType::Mixed;
            }
        }

        $previousStep = new class extends ParentStepClassTwo {};

        Http::get()->keepFromInput()->addLogger(new CliLogger())->validateBeforeRun($previousStep);

        $loggedOutput = $this->getActualOutputForAssertion();

        expect($loggedOutput)->toContain(
            'An anonymous class step, that is extending the tests\\Steps\\ParentStepClassTwo step potentially ' .
            'yields scalar value outputs',
        );
    },
);

// Runs once per dataset entry (Step and BaseStep). The previous step is an anonymous class directly extending
// one of those; the warning is expected to only call it an anonymous class step.
test(
    'the warning message, when keepFromInput() was called and previous step yields mixed outputs with an anonymous ' .
    'step class, extending one of the abstract classes Step or BaseStep, only mentions that it is an anonymous step ' .
    'class',
    function (string $extendClass) {
        $previousStep = match ($extendClass) {
            Step::class => new class extends Step {
                protected function invoke(mixed $input): Generator
                {
                    yield 'yo';
                }

                public function outputType(): StepOutputType
                {
                    return StepOutputType::Mixed;
                }
            },
            BaseStep::class => new class extends BaseStep {
                protected function invoke(mixed $input): Generator
                {
                    yield 'yo';
                }

                public function outputType(): StepOutputType
                {
                    return StepOutputType::Mixed;
                }

                public function invokeStep(Input $input): Generator
                {
                    yield from $this->invoke($input);
                }
            },
            default => throw new Exception('Invalid $extendClass parameter'),
        };

        Http::get()->keepFromInput()->addLogger(new CliLogger())->validateBeforeRun($previousStep);

        $loggedOutput = $this->getActualOutputForAssertion();

        expect($loggedOutput)->toContain('An anonymous class step potentially yields scalar value outputs');
    },
)->with([
    [Step::class],
    [BaseStep::class],
]);

/* ----------------------------- keep() ----------------------------- */

// keep() without arguments: the whole array output is expected in the output's keep data.
it('adds all from array output to the keep array in the output object, when keep() is called', function () {
    $data = ['foo' => 'one', 'bar' => 'two'];

    $keepingStep = helper_getInputReturningStep()->keep();

    $outputs = helper_invokeStepWithInput($keepingStep, $data);

    expect($outputs[0]->keep)->toBe($data);
});

// keep() without arguments and an object output: the data returned by the object's toArray() method is
// expected in the output's keep data.
it('adds all from object output to the keep array in the output object, when keep() is called', function () {
    $objectWithToArray = new class {
        /**
         * @return array<string, string>
         */
        public function toArray(): array
        {
            return ['key' => 'value', 'key2' => 'value2'];
        }
    };

    $keepingStep = helper_getInputReturningStep()->keep();

    $outputs = helper_invokeStepWithInput($keepingStep, $objectWithToArray);

    expect($outputs[0]->keep)->toBe(['key' => 'value', 'key2' => 'value2']);
});

// keep('bar'): only the 'bar' element of the output is expected in the keep data.
it('adds a key from array output to the keep array in the output, when keep() was called with a string', function () {
    $keepingStep = helper_getInputReturningStep()->keep('bar');

    $outputs = helper_invokeStepWithInput($keepingStep, ['foo' => 'one', 'bar' => 'two']);

    $kept = $outputs[0]->keep;

    expect($kept)->toBe(['bar' => 'two']);
});

// keep() with a list of keys: exactly those elements of the output are expected in the keep data.
it('adds multiple keys to the keep array in the output, when keep() was called with an array', function () {
    $keepingStep = helper_getInputReturningStep()->keep(['foo', 'baz']);

    $outputs = helper_invokeStepWithInput($keepingStep, ['foo' => 'one', 'bar' => 'two', 'baz' => 'three']);

    $kept = $outputs[0]->keep;

    expect($kept)->toBe(['foo' => 'one', 'baz' => 'three']);
});

// keep() with a string key in the array: the output's 'baz' value is expected under 'mappedKey' in the keep data.
it('maps output data to the keep array in the output, when keep() was called with an associative array', function () {
    $keepingStep = helper_getInputReturningStep()->keep(['foo', 'mappedKey' => 'baz']);

    $outputs = helper_invokeStepWithInput($keepingStep, ['foo' => 'one', 'bar' => 'two', 'baz' => 'three']);

    $kept = $outputs[0]->keep;

    expect($kept)->toBe(['foo' => 'one', 'mappedKey' => 'three']);
});

// keep() with a scalar output: the value is expected under the key 'unnamed1', together with a log message.
it('logs an error when output is scalar value and keep was used, and adds the value with an unnamed key', function () {
    $keepingStep = helper_getInputReturningStep()->addLogger(new CliLogger())->keep();

    $outputs = helper_invokeStepWithInput($keepingStep, 'hello');

    expect($outputs[0]->keep)->toBe(['unnamed1' => 'hello']);

    expect($this->getActualOutputForAssertion())
        ->toContain('yielded an output that is neither an associative array, nor an object');
});

// The input already carries 'unnamed1' in its keep data. The scalar input and the scalar output are expected
// to be added as 'unnamed2' and 'unnamed3'.
it('repeatedly adds properties with unnamed keys with increasing numbers', function () {
    $keepingStep = helper_getValueReturningStep('world')->keepFromInput()->keep();

    $input = new Input('hello', keep: ['unnamed1' => 'servus']);

    $outputs = helper_invokeStepWithInput($keepingStep, $input);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->keep)->toBe(['unnamed1' => 'servus', 'unnamed2' => 'hello', 'unnamed3' => 'world']);
});

/* ----------------------------- keepAs() ----------------------------- */

// keepAs('greeting'): the scalar output is expected under that key in the keep data.
it('adds scalar value output with the defined key to keep output data, when keepAs() was used', function () {
    $keepingStep = helper_getInputReturningStep()->keepAs('greeting');

    $outputs = helper_invokeStepWithInput($keepingStep, 'hello');

    $kept = $outputs[0]->keep;

    expect($kept)->toBe(['greeting' => 'hello']);
});

// keepAs('test'): the whole array output is expected as the value of that key in the keep data.
it('adds array output with the defined key to keep output data, when keepAs() was used', function () {
    $data = ['foo' => 'bar'];

    $keepingStep = helper_getInputReturningStep()->keepAs('test');

    $outputs = helper_invokeStepWithInput($keepingStep, $data);

    expect($outputs[0]->keep)->toBe(['test' => $data]);
});

/* ----------------------------- keepFromInput() ----------------------------- */

// keepFromInput() without arguments: the whole array input is expected in the output's keep data.
it('adds all from array input to the keep array in the output object, when keepFromInput() is called', function () {
    $inputData = ['foo' => 'one', 'bar' => 'two'];

    $keepingStep = helper_getValueReturningStep('foo')->keepFromInput();

    $outputs = helper_invokeStepWithInput($keepingStep, $inputData);

    expect($outputs[0]->keep)->toBe($inputData);
});

// keepFromInput() without arguments and an object input: the data returned by the object's toArray() method
// is expected in the output's keep data.
it('adds all from object input to the keep array in the output object, when keepFromInput() is called', function () {
    $objectWithToArray = new class {
        /**
         * @return array<string, string>
         */
        public function toArray(): array
        {
            return ['key' => 'value', 'key2' => 'value2'];
        }
    };

    $keepingStep = helper_getValueReturningStep('foo')->keepFromInput();

    $outputs = helper_invokeStepWithInput($keepingStep, $objectWithToArray);

    expect($outputs[0]->keep)->toBe(['key' => 'value', 'key2' => 'value2']);
});

// keepFromInput('bar'): only the 'bar' element of the input is expected in the keep data.
it(
    'adds a key from array input to the keep array in the output, when keepFromInput() was called with a string',
    function () {
        $keepingStep = helper_getValueReturningStep('foo')->keepFromInput('bar');

        $outputs = helper_invokeStepWithInput($keepingStep, ['foo' => 'one', 'bar' => 'two']);

        $kept = $outputs[0]->keep;

        expect($kept)->toBe(['bar' => 'two']);
    },
);

// keepFromInput() with a list of keys: exactly those elements of the input are expected in the keep data.
it(
    'adds multiple keys from the input to the keep array in the output, when keepFromInput() was called with an array',
    function () {
        $keepingStep = helper_getValueReturningStep('foo')->keepFromInput(['foo', 'baz']);

        $outputs = helper_invokeStepWithInput($keepingStep, ['foo' => 'one', 'bar' => 'two', 'baz' => 'three']);

        $kept = $outputs[0]->keep;

        expect($kept)->toBe(['foo' => 'one', 'baz' => 'three']);
    },
);

// keepFromInput() with a string key in the array: the input's 'baz' value is expected under 'mappedKey'.
it(
    'maps input data to the keep array in the output, when keepFromInput() was called with an associative array',
    function () {
        $keepingStep = helper_getValueReturningStep('foo')->keepFromInput(['foo', 'mappedKey' => 'baz']);

        $outputs = helper_invokeStepWithInput($keepingStep, ['foo' => 'one', 'bar' => 'two', 'baz' => 'three']);

        $kept = $outputs[0]->keep;

        expect($kept)->toBe(['foo' => 'one', 'mappedKey' => 'three']);
    },
);

// keepFromInput() with a scalar input: the value is expected under the key 'unnamed1', together with a log
// message.
it('logs an error when input is scalar value and keep was used, and adds the value with an unnamed key', function () {
    $keepingStep = helper_getValueReturningStep('foo')->addLogger(new CliLogger())->keepFromInput();

    $outputs = helper_invokeStepWithInput($keepingStep, 'hey');

    expect($outputs[0]->keep)->toBe(['unnamed1' => 'hey']);

    expect($this->getActualOutputForAssertion())
        ->toContain('received an input that is neither an associative array, nor an object');
});

/* ----------------------------- keepInputAs() ----------------------------- */

// keepInputAs('greeting'): the scalar input is expected under that key in the keep data.
it('adds scalar value input with the defined key to keep output data, when keepInputAs() was used', function () {
    $keepingStep = helper_getValueReturningStep('yo')->keepInputAs('greeting');

    $outputs = helper_invokeStepWithInput($keepingStep, 'hello');

    $kept = $outputs[0]->keep;

    expect($kept)->toBe(['greeting' => 'hello']);
});

// keepInputAs('test'): the whole array input is expected as the value of that key in the keep data.
// The description names keepInputAs(), the method that is actually exercised here.
it('adds array input with the defined key to keep output data, when keepInputAs() was used', function () {
    $step = helper_getValueReturningStep('yay')
        ->keepInputAs('test');

    $outputs = helper_invokeStepWithInput($step, ['foo' => 'bar']);

    expect($outputs[0]->keep)->toBe(['test' => ['foo' => 'bar']]);
});

/* ------------------------ combinations of keep calls ------------------------ */

// 'foo' is kept from the input and from the output: both values are expected as a list under 'foo', with the
// input value first.
it('makes an array of values when the same key should be kept from input and output', function () {
    $keepingStep = helper_getValueReturningStep(['foo' => 'one', 'bar' => 'two']);

    $keepingStep->keepFromInput('foo')->keep(['foo', 'bar']);

    $outputs = helper_invokeStepWithInput($keepingStep, ['foo' => 'bar']);

    $kept = $outputs[0]->keep;

    expect($kept)->toBe(['foo' => ['bar', 'one'], 'bar' => 'two']);
});

// The input's 'foo' is mapped to 'inputFoo', so it is expected next to the output's 'foo' instead of being
// combined with it.
test('same key in input and output, but they are mapped to different keys for keep data', function () {
    $keepingStep = helper_getValueReturningStep(['foo' => 'one', 'bar' => 'two']);

    $keepingStep->keepFromInput(['inputFoo' => 'foo'])->keep(['foo', 'bar']);

    $outputs = helper_invokeStepWithInput($keepingStep, ['foo' => 'bar']);

    $kept = $outputs[0]->keep;

    expect($kept)->toBe(['inputFoo' => 'bar', 'foo' => 'one', 'bar' => 'two']);
});

// 'foo' exists in the keep data the input already carries, in the input value and in the output. All values
// are expected in one flat list under 'foo', in that order.
it('merges data for the same key recursively', function () {
    $keepingStep = helper_getValueReturningStep(['foo' => ['one', 'two'], 'bar' => 'two']);

    $keepingStep->keepFromInput('foo')->keep(['foo', 'bar']);

    $input = new Input(['foo' => ['bar', 'baz']], keep: ['foo' => 'test']);

    $outputs = helper_invokeStepWithInput($keepingStep, $input);

    $kept = $outputs[0]->keep;

    expect($kept)->toBe(['foo' => ['test', 'bar', 'baz', 'one', 'two'], 'bar' => 'two']);
});

/* ----------------------------- keepsAnything() ----------------------------- */

// Runs once per dataset row. The first four columns decide which of keep(), keepAs(), keepFromInput() and
// keepInputAs() is called on the step (in this order), the fifth column is the expected keepsAnything() result.
test(
    'keepsAnything() returns true when one of keep(), keepAs(), keepFromInput() or keepInputAs() was called',
    function (bool $callKeep, bool $callKeepAs, bool $callKeepFromInput, bool $callKeepInputAs, bool $expected) {
        $step = helper_getInputReturningStep();

        if ($callKeep) {
            $step->keep();
        }

        if ($callKeepAs) {
            $step->keepAs('foo');
        }

        if ($callKeepFromInput) {
            $step->keepFromInput();
        }

        if ($callKeepInputAs) {
            $step->keepInputAs('bar');
        }

        expect($step->keepsAnything())->toBe($expected);
    },
)->with([
    // keep, keepAs, keepFromInput, keepInputAs, expected
    [false, false, false, false, false],
    [true, false, false, false, true],
    [false, true, false, false, true],
    [false, false, true, false, true],
    [false, false, false, true, true],
]);

// Runs once per dataset row. The first four columns decide which of keep(), keepAs(), keepFromInput() and
// keepInputAs() is called on the step (in this order), the fifth column is the expected
// keepsAnythingFromInputData() result.
test(
    'keepsAnythingFromInputData() returns true when one of keepFromInput() or keepInputAs() was called',
    function (bool $callKeep, bool $callKeepAs, bool $callKeepFromInput, bool $callKeepInputAs, bool $expected) {
        $step = helper_getInputReturningStep();

        if ($callKeep) {
            $step->keep();
        }

        if ($callKeepAs) {
            $step->keepAs('foo');
        }

        if ($callKeepFromInput) {
            $step->keepFromInput();
        }

        if ($callKeepInputAs) {
            $step->keepInputAs('bar');
        }

        expect($step->keepsAnythingFromInputData())->toBe($expected);
    },
)->with([
    // keep, keepAs, keepFromInput, keepInputAs, expected
    [false, false, false, false, false],
    [true, false, false, false, false],
    [false, true, false, false, false],
    [false, false, true, false, true],
    [false, false, false, true, true],
]);

// Runs once per dataset row. The first four columns decide which of keep(), keepAs(), keepFromInput() and
// keepInputAs() is called on the step (in this order), the fifth column is the expected
// keepsAnythingFromOutputData() result.
test(
    'keepsAnythingFromOutputData() returns true when one of keep() or keepAs() was called',
    function (bool $callKeep, bool $callKeepAs, bool $callKeepFromInput, bool $callKeepInputAs, bool $expected) {
        $step = helper_getInputReturningStep();

        if ($callKeep) {
            $step->keep();
        }

        if ($callKeepAs) {
            $step->keepAs('foo');
        }

        if ($callKeepFromInput) {
            $step->keepFromInput();
        }

        if ($callKeepInputAs) {
            $step->keepInputAs('bar');
        }

        expect($step->keepsAnythingFromOutputData())->toBe($expected);
    },
)->with([
    // keep, keepAs, keepFromInput, keepInputAs, expected
    [false, false, false, false, false],
    [true, false, false, false, true],
    [false, true, false, false, true],
    [false, false, true, false, false],
    [false, false, false, true, false],
]);

/* ----------------------------- sub crawlers ----------------------------- */

// No parent crawler is set on the step, so invoking it with a sub crawler defined is expected to log an error.
it('logs an error message when a sub crawler is defined and step has no reference to a parent crawler', function () {
    $step = helper_getInputReturningStep()->addLogger(new CliLogger());

    $step->subCrawlerFor('bar', fn (Crawler $crawler) => $crawler->addStep(Http::get()));

    helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => ['https://www.example.com']]);

    $loggedOutput = $this->getActualOutputForAssertion();

    expect($loggedOutput)->toContain(
        'Can\'t make sub crawler, because the step has no reference to the parent crawler.',
    );
});

// The step yields the scalar 'foo', so the sub crawler defined for 'bar' is expected to result in an error log.
it('logs an error message when a sub crawler is defined and output is scalar value', function () {
    $parentCrawler = HttpCrawler::make()->withUserAgent('Test');

    $step = helper_getInputReturningStep()->addLogger(new CliLogger());

    $step->setParentCrawler($parentCrawler);

    $step->subCrawlerFor('bar', fn (Crawler $crawler) => $crawler->addStep(Http::get()));

    helper_invokeStepWithInput($step, 'foo');

    $loggedOutput = $this->getActualOutputForAssertion();

    expect($loggedOutput)
        ->toContain('The sub crawler feature works only with outputs that are associative arrays');
});

// The HTML in 'bar' is fed to the sub crawler; its single result is expected to replace the 'bar' value.
it('runs a sub crawler for a certain output property', function () {
    $parentCrawler = HttpCrawler::make()->withUserAgent('Test');

    $step = helper_getInputReturningStep()->addLogger(new CliLogger());

    $step->setParentCrawler($parentCrawler);

    $step->subCrawlerFor(
        'bar',
        fn (Crawler $crawler) => $crawler->addStep(Html::root()->extract(['title' => 'h1'])),
    );

    $results = helper_invokeStepWithInput($step, [
        'foo' => 'hey',
        'bar' => '<!doctype html><html><head></head><body><h1>Hello World!</h1></body>',
    ]);

    expect($results)->toHaveCount(1);

    expect($results[0]->get())->toBe(['foo' => 'hey', 'bar' => ['title' => 'Hello World!']]);
});

// The sub crawler extracts one result per .item element; all of them are expected as a list under 'bar',
// while the other output properties stay untouched.
test('when a sub crawler returns multiple results, they are an array in the parent output', function () {
    $parentCrawler = HttpCrawler::make()->withUserAgent('Test');

    $step = helper_getInputReturningStep()->addLogger(new CliLogger());

    $step->setParentCrawler($parentCrawler);

    $step->subCrawlerFor(
        'bar',
        fn (Crawler $crawler) => $crawler->addStep(Html::each('.item')->extract(['title' => 'h3'])),
    );

    $html = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
        <div class="item"><h3>one</h3></div>
        <div class="item"><h3>two</h3></div>
        <div class="item"><h3>three</h3></div>
        </body>
        HTML;

    $results = helper_invokeStepWithInput($step, ['foo' => 'hey', 'bar' => $html, 'baz' => 'yo']);

    expect($results)->toHaveCount(1);

    expect($results[0]->get())->toBe([
        'foo' => 'hey',
        'bar' => [
            ['title' => 'one'],
            ['title' => 'two'],
            ['title' => 'three'],
        ],
        'baz' => 'yo',
    ]);
});

// 'bar' holds three HTML documents; one sub crawler result per document is expected under 'bar', in the same
// order.
it('runs a sub crawler with multiple inputs, when defined property is array', function () {
    $parentCrawler = HttpCrawler::make()->withUserAgent('Test');

    $step = helper_getInputReturningStep()->addLogger(new CliLogger());

    $step->setParentCrawler($parentCrawler);

    $step->subCrawlerFor(
        'bar',
        fn (Crawler $crawler) => $crawler->addStep(Html::root()->extract(['title' => 'h1'])),
    );

    $documents = [
        '<!doctype html><html><head></head><body><h1>No. 1</h1></body>',
        '<!doctype html><html><head></head><body><h1>No. 2</h1></body>',
        '<!doctype html><html><head></head><body><h1>No. 3</h1></body>',
    ];

    $results = helper_invokeStepWithInput($step, ['foo' => 'hey', 'bar' => $documents, 'baz' => 'yo']);

    expect($results)->toHaveCount(1);

    expect($results[0]->get())->toBe([
        'foo' => 'hey',
        'bar' => [
            ['title' => 'No. 1'],
            ['title' => 'No. 2'],
            ['title' => 'No. 3'],
        ],
        'baz' => 'yo',
    ]);
});

// The output has no 'bar' key, so it is expected to be yielded unchanged.
it('does not run a sub crawler, when output does not contain the defined key', function () {
    $parentCrawler = HttpCrawler::make()->withUserAgent('Test');

    $step = helper_getInputReturningStep()->addLogger(new CliLogger());

    $step->setParentCrawler($parentCrawler);

    $step->subCrawlerFor(
        'bar',
        fn (Crawler $crawler) => $crawler->addStep(Html::root()->extract(['title' => 'h1'])),
    );

    $data = ['foo' => 'hey', 'baz' => 'ho'];

    $results = helper_invokeStepWithInput($step, $data);

    expect($results)->toHaveCount(1);

    expect($results[0]->get())->toBe($data);
});


================================================
FILE: tests/Steps/CsvTest.php
================================================
<?php

namespace tests\Steps;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Steps\Csv;
use Crwlr\Crawler\Steps\Filters\Filter;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use GuzzleHttp\Psr7\Utils;
use InvalidArgumentException;
use stdClass;
use tests\_Stubs\DummyLogger;

use function tests\helper_invokeStepWithInput;
use function tests\helper_traverseIterable;

/**
 * Builds the path of a CSV fixture file inside the _Files/Csv directory next to this test file.
 */
function helper_csvFilePath(string $fileName): string
{
    return sprintf('%s/_Files/Csv/%s', __DIR__, $fileName);
}

it('maps a CSV string', function () {
    $csv = <<<CSV
        123,"crwl.io","https://www.crwl.io"
        234,"example.com","https://www.example.com"
        345,"otsch.codes","https://www.otsch.codes"
        456,"crwlr.software","https://www.crwlr.software"
        CSV;

    $step = Csv::parseString(['id', 'domain', 'url']);

    $rows = helper_invokeStepWithInput($step, $csv);

    // One output per CSV line, keyed by the column names passed to parseString().
    expect($rows)->toHaveCount(4);

    expect($rows[0]->get())->toBe(['id' => '123', 'domain' => 'crwl.io', 'url' => 'https://www.crwl.io']);

    expect($rows[1]->get())->toBe(['id' => '234', 'domain' => 'example.com', 'url' => 'https://www.example.com']);

    expect($rows[2]->get())->toBe(['id' => '345', 'domain' => 'otsch.codes', 'url' => 'https://www.otsch.codes']);

    expect($rows[3]->get())->toBe(
        ['id' => '456', 'domain' => 'crwlr.software', 'url' => 'https://www.crwlr.software'],
    );
});

it('maps a file', function () {
    $step = Csv::parseFile(['id', 'name', 'homepage']);

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('basic.csv'));

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['id' => '123', 'name' => 'Otsch', 'homepage' => 'https://www.otsch.codes']);

    expect($rows[1]->get())->toBe(['id' => '234', 'name' => 'John Doe', 'homepage' => 'https://www.john.doe']);

    expect($rows[2]->get())->toBe(['id' => '345', 'name' => 'Jane Doe', 'homepage' => 'https://www.jane.doe']);
});

it('works with a RespondedRequest as input', function () {
    $csvBody = <<<CSV
        123,"John Doe","+431234567"
        234,"Jane Doe","+432345678"
        CSV;

    // The CSV is delivered as the body of the HTTP response wrapped in the RespondedRequest.
    $request = new Request('GET', '/');

    $response = new Response(200, [], Utils::streamFor($csvBody));

    $respondedRequest = new RespondedRequest($request, $response);

    $rows = helper_invokeStepWithInput(Csv::parseString(['id', 'name', 'phone']), $respondedRequest);

    expect($rows)->toHaveCount(2);

    expect($rows[0]->get())->toBe(['id' => '123', 'name' => 'John Doe', 'phone' => '+431234567']);

    expect($rows[1]->get())->toBe(['id' => '234', 'name' => 'Jane Doe', 'phone' => '+432345678']);
});

it('works with an object having a __toString method', function () {
    // Anonymous object whose string representation is the CSV to parse.
    $stringable = new class {
        public function __toString(): string
        {
            return <<<CSV
                123,"Max Mustermann","+431234567"
                234,"Julia Musterfrau","+432345678"
                CSV;
        }
    };

    $step = Csv::parseString(['id', 'name', 'phone']);

    $rows = helper_invokeStepWithInput($step, $stringable);

    expect($rows)->toHaveCount(2);

    expect($rows[0]->get())->toBe(['id' => '123', 'name' => 'Max Mustermann', 'phone' => '+431234567']);

    expect($rows[1]->get())->toBe(['id' => '234', 'name' => 'Julia Musterfrau', 'phone' => '+432345678']);
});

it('logs an error message for other inputs', function (string $method, mixed $input) {
    $logger = new DummyLogger();

    // The dataset's first value selects which of the two Csv step variants is exercised.
    $step = match ($method) {
        'string' => Csv::parseString(['column']),
        default => Csv::parseFile(['column']),
    };

    $step = $step->addLogger($logger);

    helper_traverseIterable($step->invokeStep(new Input($input)));

    expect($logger->messages)->not->toBeEmpty();

    $firstMessage = $logger->messages[0]['message'];

    expect($firstMessage)->toStartWith(
        'The Crwlr\\Crawler\\Steps\\Csv step was called with input that it can not work with: ',
    );

    expect($firstMessage)->toEndWith('. The invalid input is of type ' . gettype($input) . '.');
})->with([
    ['string', 123],
    ['string', new stdClass()],
    ['string', 12.345],
    ['string', true],
    ['string', null],
    ['file', 123],
    ['file', new stdClass()],
    ['file', 12.345],
    ['file', true],
    ['file', null],
]);

it('can map columns using numerical array keys for the columns', function () {
    $csv = <<<CSV
        123,"crwlr.software","https://www.crwlr.software","PHP Web Crawling and Scraping Library"
        234,"otsch.codes","https://www.otsch.codes","I am Otsch, I code"
        CSV;

    // Only the columns at index 1 and 3 are mapped; the others must not show up in the output.
    $step = Csv::parseString([1 => 'domain', 3 => 'description']);

    $rows = helper_invokeStepWithInput($step, $csv);

    expect($rows)->toHaveCount(2);

    expect($rows[0]->get())->toBe([
        'domain' => 'crwlr.software', 'description' => 'PHP Web Crawling and Scraping Library',
    ]);

    expect($rows[1]->get())->toBe(['domain' => 'otsch.codes', 'description' => 'I am Otsch, I code']);
});

it('can map columns using numerical array keys for the columns when parsing file', function () {
    $step = Csv::parseFile([1 => 'name', 2 => 'homepage']);

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('basic.csv'));

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['name' => 'Otsch', 'homepage' => 'https://www.otsch.codes']);

    expect($rows[1]->get())->toBe(['name' => 'John Doe', 'homepage' => 'https://www.john.doe']);

    expect($rows[2]->get())->toBe(['name' => 'Jane Doe', 'homepage' => 'https://www.jane.doe']);
});

it('can map columns using null for columns to skip', function () {
    $csv = <<<CSV
        1997,Ford,E350,"ac, abs, moon",3000.00
        1999,Chevy,"Venture \"Extended Edition\"","",4900.00
        1999,Chevy,"Venture \"Extended Edition, Very Large\"",,5000.00
        CSV;

    // null entries in the mapping mark columns that are left out of the output.
    $step = Csv::parseString([null, 'make', null, null, 'price']);

    $rows = helper_invokeStepWithInput($step, $csv);

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['make' => 'Ford', 'price' => '3000.00']);

    expect($rows[1]->get())->toBe(['make' => 'Chevy', 'price' => '4900.00']);

    expect($rows[2]->get())->toBe(['make' => 'Chevy', 'price' => '5000.00']);
});

it('can map columns using null for columns to skip when parsing file', function () {
    $step = Csv::parseFile(['id', null, 'homepage']);

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('basic.csv'));

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['id' => '123', 'homepage' => 'https://www.otsch.codes']);

    expect($rows[1]->get())->toBe(['id' => '234', 'homepage' => 'https://www.john.doe']);

    expect($rows[2]->get())->toBe(['id' => '345', 'homepage' => 'https://www.jane.doe']);
});

it('uses the values from the first line as output keys when no column mapping defined', function () {
    $string = <<<CSV
        id,title,price
        1,"Raspberry Pi Zero 2 W",16.99
        2,"Raspberry Pi Pico",4.20
        3,"Raspberry Pi 400 Personal Computer Kit & Unit",79.49
        CSV;

    // No column mapping is passed, so the keys have to come from the (skipped) headline row.
    $outputs = helper_invokeStepWithInput(Csv::parseString()->skipFirstLine(), $string);

    // Check every data row, not only the first one, so a regression that affects only later
    // rows (e.g. the headline keys being lost after the first row) is caught as well.
    expect($outputs)->toHaveCount(3)
        ->and($outputs[0]->get())->toBe(['id' => '1', 'title' => 'Raspberry Pi Zero 2 W', 'price' => '16.99'])
        ->and($outputs[1]->get())->toBe(['id' => '2', 'title' => 'Raspberry Pi Pico', 'price' => '4.20'])
        ->and($outputs[2]->get())->toBe(
            ['id' => '3', 'title' => 'Raspberry Pi 400 Personal Computer Kit & Unit', 'price' => '79.49'],
        );
});

it('uses the values from the first line as output keys when no column mapping defined when parsing file', function () {
    // No column mapping is passed to parseFile(); the first line is skipped as data.
    $step = Csv::parseFile()->skipFirstLine();

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('with-column-headlines.csv'));

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe([
        'Stunde' => '1',
        'Montag' => 'Mathematik',
        'Dienstag' => 'Deutsch',
        'Mittwoch' => 'Englisch',
        'Donnerstag' => 'Erdkunde',
        'Freitag' => 'Politik',
    ]);
});

it('skips the first line when defined via method call to skipFirstLine method', function () {
    $string = <<<CSV
        Year,Make,Model,Description,Price
        1997,Ford,E350,"ac, abs, moon",3000.00
        1999,Chevy,"Venture \"Extended Edition\"","",4900.00
        1999,Chevy,"Venture \"Extended Edition, Very Large\"",,5000.00
        CSV;

    $step = Csv::parseString([null, 'make', null, null, 'price'])
        ->skipFirstLine();

    $outputs = helper_invokeStepWithInput($step, $string);

    // Three outputs for four lines shows the headline row was dropped. All remaining rows are
    // asserted (not just the first), so the rows with escaped quotes are covered here too.
    expect($outputs)->toHaveCount(3)
        ->and($outputs[0]->get())->toBe(['make' => 'Ford', 'price' => '3000.00'])
        ->and($outputs[1]->get())->toBe(['make' => 'Chevy', 'price' => '4900.00'])
        ->and($outputs[2]->get())->toBe(['make' => 'Chevy', 'price' => '5000.00']);
});

it('skips the first line when parsing file when defined via method call to skipFirstLine method', function () {
    $step = Csv::parseFile([1 => 'fach-erste', 2 => 'fach-zweite']);

    $step = $step->skipFirstLine();

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('with-column-headlines.csv'));

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['fach-erste' => 'Mathematik', 'fach-zweite' => 'Deutsch']);

    expect($rows[1]->get())->toBe(['fach-erste' => 'Sport', 'fach-zweite' => 'Deutsch']);

    expect($rows[2]->get())->toBe(['fach-erste' => 'Sport', 'fach-zweite' => 'Religion (ev., kath.)']);
});

it('skips the first line when defined via constructor param', function () {
    $csv = <<<CSV
        Year,Make,Model,Description,Price
        1997,Ford,E350,"ac, abs, moon",3000.00
        CSV;

    // The second argument (true) is what this test relies on to skip the headline row.
    $step = Csv::parseString([null, 'make', null, null, 'price'], true);

    $rows = helper_invokeStepWithInput($step, $csv);

    expect($rows)->toHaveCount(1);

    expect($rows[0]->get())->toBe(['make' => 'Ford', 'price' => '3000.00']);
});

it('skips the first line when parsing file when defined via constructor param', function () {
    // The second argument (true) is what this test relies on to skip the headline row.
    $step = Csv::parseFile([1 => 'fach-erste', 3 => 'fach-dritte'], true);

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('with-column-headlines.csv'));

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['fach-erste' => 'Mathematik', 'fach-dritte' => 'Englisch']);

    expect($rows[1]->get())->toBe(['fach-erste' => 'Sport', 'fach-dritte' => 'Englisch']);

    expect($rows[2]->get())->toBe(['fach-erste' => 'Sport', 'fach-dritte' => 'Kunst']);
});

it('uses a different separator when you set one', function () {
    $csv = <<<CSV
        123|"CoDerOtsch"|Christian|Olear|35
        234|"g3n1u5"|Albert|Einstein|143
        345|"sWiFtY"|Taylor|Swift|32
        CSV;

    // Columns in the input are delimited by "|" instead of a comma.
    $step = Csv::parseString([1 => 'username', 2 => 'firstname', 3 => 'surname']);

    $step = $step->separator('|');

    $rows = helper_invokeStepWithInput($step, $csv);

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['username' => 'CoDerOtsch', 'firstname' => 'Christian', 'surname' => 'Olear']);

    expect($rows[1]->get())->toBe(['username' => 'g3n1u5', 'firstname' => 'Albert', 'surname' => 'Einstein']);

    expect($rows[2]->get())->toBe(['username' => 'sWiFtY', 'firstname' => 'Taylor', 'surname' => 'Swift']);
});

it('uses a different separator when you set one, when parsing a file', function () {
    $step = Csv::parseFile([1 => 'username', 4 => 'age']);

    $step = $step->separator('*');

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('separator.csv'));

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['username' => 'CoDerOtsch', 'age' => '35']);

    expect($rows[1]->get())->toBe(['username' => 'g3n1u5', 'age' => '143']);

    expect($rows[2]->get())->toBe(['username' => 'sWiFtY', 'age' => '32']);
});

it('throws an InvalidArgumentException when you try to set a multi character separator', function () {
    $step = Csv::parseString([]);

    // Three characters instead of one: the expected exception has to originate from this call.
    $step->separator('***');
})->throws(InvalidArgumentException::class);

it('uses a different enclosure when you set one', function () {
    $csv = <<<CSV
        123,/Fritattensuppe/,3.9
        234,/Wiener Schnitzel vom Schwein/,12.7
        345,/Semmelknödel mit Schwammerlsauce/,9.5
        CSV;

    // Values in the input are wrapped in "/" instead of double quotes.
    $step = Csv::parseString([1 => 'meal', 2 => 'price']);

    $step = $step->enclosure('/');

    $rows = helper_invokeStepWithInput($step, $csv);

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['meal' => 'Fritattensuppe', 'price' => '3.9']);

    expect($rows[1]->get())->toBe(['meal' => 'Wiener Schnitzel vom Schwein', 'price' => '12.7']);

    expect($rows[2]->get())->toBe(['meal' => 'Semmelknödel mit Schwammerlsauce', 'price' => '9.5']);
});

it('uses a different enclosure when you set one, when parsing a file', function () {
    $step = Csv::parseFile([1 => 'meal', 2 => 'price']);

    $step = $step->enclosure('?');

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('enclosure.csv'));

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['meal' => 'Kräftige Rindsuppe', 'price' => '4.5']);

    expect($rows[1]->get())->toBe(['meal' => 'Crispy Chicken Burger', 'price' => '12']);

    expect($rows[2]->get())->toBe(['meal' => 'Duett von Saibling und Forelle', 'price' => '21']);
});

it('uses a different escape character when you set one', function () {
    $csv = <<<CSV
        123,"test &"escape&" test",test
        CSV;

    // "&" is configured as the escape character for the quotes inside the enclosed value.
    $step = Csv::parseString([1 => 'escaped']);

    $step = $step->escape('&');

    $rows = helper_invokeStepWithInput($step, $csv);

    expect($rows)->toHaveCount(1);

    expect($rows[0]->get())->toBe(['escaped' => 'test &"escape&" test']);
});

it('uses a different escape character when you set one, when parsing a file', function () {
    $step = Csv::parseFile([1 => 'escaped']);

    $step = $step->escape('%');

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('escape.csv'));

    expect($rows)->toHaveCount(2);

    expect($rows[0]->get())->toBe(['escaped' => 'test %"escape%" test']);

    expect($rows[1]->get())->toBe(['escaped' => 'foo %"escape%" bar %"baz%" lorem']);
});

it('filters rows', function () {
    $csv = <<<CSV
        ID,firstname,surname,isPremium
        123,Freddy,Mercury,1
        124,Christian,Olear,1
        125,Jeff,Bezos,0
        CSV;

    $step = Csv::parseString(['id', 3 => 'isPremium'])->skipFirstLine();

    // Only rows whose isPremium column equals the string '1' should be yielded.
    $step = $step->where('isPremium', Filter::equal('1'));

    $rows = helper_invokeStepWithInput($step, $csv);

    expect($rows)->toHaveCount(2);

    expect($rows[0]->get())->toBe(['id' => '123', 'isPremium' => '1']);

    expect($rows[1]->get())->toBe(['id' => '124', 'isPremium' => '1']);
});

it('filters rows when parsing a file', function () {
    $step = Csv::parseFile(['Stunde', 'Fach'])->skipFirstLine();

    // Only rows whose second column equals 'Sport' should be yielded.
    $step = $step->where('Fach', Filter::equal('Sport'));

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('with-column-headlines.csv'));

    expect($rows)->toHaveCount(2);

    expect($rows[0]->get())->toBe(['Stunde' => '2', 'Fach' => 'Sport']);

    expect($rows[1]->get())->toBe(['Stunde' => '3', 'Fach' => 'Sport']);
});

it('filters rows by multiple filters', function () {
    $csv = <<<CSV
        ID,firstname,surname,isVip,queenBandMember
        123,Freddy,Mercury,1,1
        124,Ozzy,Osbourne,1,0
        125,Barry,Mitchell,0,1
        CSV;

    $step = Csv::parseString(['id', 3 => 'isVip', 4 => 'isQueenBandMember'])->skipFirstLine();

    // Of the three rows, only 123 has '1' in both filtered columns.
    $step = $step->where('isVip', Filter::equal('1'));

    $step = $step->where('isQueenBandMember', Filter::equal('1'));

    $rows = helper_invokeStepWithInput($step, $csv);

    expect($rows)->toHaveCount(1);

    expect($rows[0]->get())->toBe(['id' => '123', 'isVip' => '1', 'isQueenBandMember' => '1']);
});

it('filters rows by multiple filters when parsing a file', function () {
    $columns = ['Stunde', 'Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag'];

    $step = Csv::parseFile($columns)->skipFirstLine();

    // A row has to match both filters to be yielded.
    $step = $step->where('Montag', Filter::equal('Sport'));

    $step = $step->where('Donnerstag', Filter::equal('Sport'));

    $rows = helper_invokeStepWithInput($step, helper_csvFilePath('with-column-headlines.csv'));

    expect($rows)->toHaveCount(1);

    expect($rows[0]->get())->toBe([
        'Stunde' => '2',
        'Montag' => 'Sport',
        'Dienstag' => 'Deutsch',
        'Mittwoch' => 'Englisch',
        'Donnerstag' => 'Sport',
        'Freitag' => 'Geschichte',
    ]);
});

it('filters rows with a StringCheck filter', function () {
    $csv = <<<CSV
        ID,firstname,surname
        123,Christian,Bale
        124,"Christian Anton",Smith
        125,"Another Christian",Idontknow
        126,Jennifer,Aniston
        CSV;

    $step = Csv::parseString(['id', 'firstname'])->skipFirstLine();

    // "Christian" may appear anywhere in the firstname: alone, at the start or at the end.
    $step = $step->where('firstname', Filter::stringContains('Christian'));

    $rows = helper_invokeStepWithInput($step, $csv);

    expect($rows)->toHaveCount(3);

    expect($rows[0]->get())->toBe(['id' => '123', 'firstname' => 'Christian']);

    expect($rows[1]->get())->toBe(['id' => '124', 'firstname' => 'Christian Anton']);

    expect($rows[2]->get())->toBe(['id' => '125', 'firstname' => 'Another Christian']);
});


================================================
FILE: tests/Steps/Dom/HtmlDocumentTest.php
================================================
<?php

namespace tests\Steps\Dom;

use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Dom\HtmlElement;
use Crwlr\Crawler\Steps\Dom\NodeList;

it('gets the href of a base tag in the document', function () {
    $source = '<!doctype html><html><head><title>foo</title><base href="/foo/bar" /></head><body>hello</body></html>';

    $baseHref = (new HtmlDocument($source))->getBaseHref();

    expect($baseHref)->toBe('/foo/bar');
});

it('gets the href of the first base tag in the document', function () {
    // Two base tags: the href of the one that comes first in the markup is expected.
    $source = <<<HTML
        <!doctype html>
        <html>
        <head>
            <title>foo</title>
            <base href="/foo" />
            <base href="/bar" />
        </head>
        <body>hey</body>
        </html>
        HTML;

    $baseHref = (new HtmlDocument($source))->getBaseHref();

    expect($baseHref)->toBe('/foo');
});

test('getBaseHref() returns null if the document does not contain a base tag', function () {
    $source = '<!doctype html><html><head><title>foo</title></head><body>hey</body></html>';

    $baseHref = (new HtmlDocument($source))->getBaseHref();

    expect($baseHref)->toBeNull();
});

test('the querySelector() method returns an HtmlElement object', function () {
    $source = '<!doctype html><html><head><title>foo</title></head><body><div class="element">hello</div></body></html>';

    $doc = new HtmlDocument($source);

    $element = $doc->querySelector('.element');

    expect($element)->toBeInstanceOf(HtmlElement::class);
});

test('the querySelectorAll() method returns a NodeList of HtmlElement objects', function () {
    $source = '<!doctype html><html><head><title>foo</title></head><body><ul><li>foo</li><li>bar</li></ul></body></html>';

    $doc = new HtmlDocument($source);

    $items = $doc->querySelectorAll('ul li');

    expect($items)->toBeInstanceOf(NodeList::class);

    // Count the iterations so the test can't pass vacuously when the list yields nothing.
    $checkedNodes = 0;

    foreach ($items as $item) {
        expect($item)->toBeInstanceOf(HtmlElement::class);

        $checkedNodes++;
    }

    expect($checkedNodes > 0)->toBeTrue();
});

test('the queryXPath() method returns a NodeList of HtmlElement objects', function () {
    $source = '<!doctype html><html><head><title>foo</title></head><body><ul><li>foo</li><li>bar</li></ul></body></html>';

    $doc = new HtmlDocument($source);

    $items = $doc->queryXPath('//ul/li');

    expect($items)->toBeInstanceOf(NodeList::class);

    // Count the iterations so the test can't pass vacuously when the list yields nothing.
    $checkedNodes = 0;

    foreach ($items as $item) {
        expect($item)->toBeInstanceOf(HtmlElement::class);

        $checkedNodes++;
    }

    expect($checkedNodes > 0)->toBeTrue();
});


================================================
FILE: tests/Steps/Dom/HtmlElementTest.php
================================================
<?php

namespace tests\Steps\Dom;

use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Dom\HtmlElement;
use Crwlr\Crawler\Steps\Dom\NodeList;

test('child nodes selected via querySelector() are HtmlElement instances', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
        <div id="wrapper"><div class="element"></div></div>
        </body>
        </html>
        HTML;

    $wrapper = (new HtmlDocument($source))->querySelector('#wrapper');

    expect($wrapper)->toBeInstanceOf(HtmlElement::class);

    // Query again from the selected element, not from the document.
    $child = $wrapper?->querySelector('.element');

    expect($child)->toBeInstanceOf(HtmlElement::class);
});

test('child nodes selected via querySelectorAll() are HtmlElement instances', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
        <div id="wrapper">
            <div class="element">foo</div>
            <div class="element">bar</div>
        </div>
        </body>
        </html>
        HTML;

    $wrapper = (new HtmlDocument($source))->querySelector('#wrapper');

    expect($wrapper)->toBeInstanceOf(HtmlElement::class);

    // Query again from the selected element, not from the document.
    $children = $wrapper?->querySelectorAll('.element');

    expect($children)->toBeInstanceOf(NodeList::class);

    expect($children?->count())->toBe(2);

    expect($children?->first())->toBeInstanceOf(HtmlElement::class);

    expect($children?->last())->toBeInstanceOf(HtmlElement::class);
});

test('child nodes selected via queryXPath() are HtmlElement instances', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
        <div id="wrapper">
            <div class="element">foo</div>
            <div class="element">bar</div>
        </div>
        </body>
        </html>
        HTML;

    $doc = new HtmlDocument($source);

    $wrapper = $doc->queryXPath('//*[@id="wrapper"]')->first();

    expect($wrapper)->toBeInstanceOf(HtmlElement::class);

    // Query again from the selected element, not from the document.
    $children = $wrapper?->queryXPath('//*[contains(@class, "element")]');

    expect($children)->toBeInstanceOf(NodeList::class);

    expect($children?->count())->toBe(2);

    expect($children?->first())->toBeInstanceOf(HtmlElement::class);

    expect($children?->first()?->text())->toBe('foo');

    expect($children?->last())->toBeInstanceOf(HtmlElement::class);

    expect($children?->last()?->text())->toBe('bar');
});

it('gets the node name', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
        <div class="element"><span class="child"></span></div>
        </body>
        </html>
        HTML;

    $element = (new HtmlDocument($source))->querySelector('.element');

    expect($element?->nodeName())->toBe('div');

    // Also checked on a nested element selected from the first one.
    $child = $element?->querySelector('.child');

    expect($child?->nodeName())->toBe('span');
});

it('gets the text of a node', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
        <div class="element">
            bli bla <span>blub</span>
        </div>
        </body>
        </html>
        HTML;

    $element = (new HtmlDocument($source))->querySelector('.element');

    // Expected: the text including the nested span's text, without the surrounding whitespace.
    $text = $element?->text();

    expect($text)->toBe('bli bla blub');
});

it('gets the outer HTML of a node', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
        <div class="element">
            bli bla <span>blub</span>
        </div>
        </body>
        </html>
        HTML;

    $element = (new HtmlDocument($source))->querySelector('.element');

    // Expected markup, including the element's own tag, one array entry per line.
    $expectedHtml = implode(PHP_EOL, [
        '<div class="element">',
        '    bli bla <span>blub</span>',
        '</div>',
    ]);

    expect($element?->outerHtml())->toBe($expectedHtml);
});

it('gets the inner HTML of a node', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
        <div class="element">
            bli bla <span>blub</span>
        </div>
        </body>
        </html>
        HTML;

    $element = (new HtmlDocument($source))->querySelector('.element');

    // The empty first and last entries produce the leading and trailing line break.
    $expectedHtml = implode(PHP_EOL, [
        '',
        '    bli bla <span>blub</span>',
        '',
    ]);

    expect($element?->innerHtml())->toBe($expectedHtml);
});

it('gets an attribute from a node', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
        <a href="/foo/bar" class="element">Link</a>
        </body>
        </html>
        HTML;

    $link = (new HtmlDocument($source))->querySelector('.element');

    $href = $link?->getAttribute('href');

    expect($href)->toBe('/foo/bar');
});


================================================
FILE: tests/Steps/Dom/NodeListTest.php
================================================
<?php

namespace Tests\Steps\Dom;

use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Dom\HtmlElement;
use Crwlr\Crawler\Steps\Dom\Node;
use Crwlr\Crawler\Steps\Dom\NodeList;
use DOMNode;
use Symfony\Component\DomCrawler\Crawler;

it('can be constructed from a symfony Crawler instance', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
            <ul><li>foo</li><li>bar</li><li>baz</li></ul>
        </body>
        </html>
        HTML;

    $listItems = (new Crawler($source))->filter('ul li');

    // The second constructor argument wraps each original node in an HtmlElement.
    $makeElement = function (object $node): HtmlElement {
        /** @var \Dom\Node|DOMNode|Crawler $node */
        return new HtmlElement($node);
    };

    $nodeList = new NodeList($listItems, $makeElement);

    expect($nodeList->count())->toBe(3);

    expect($nodeList->first()?->text())->toBe('foo');

    expect($nodeList->nth(2)?->text())->toBe('bar');

    expect($nodeList->last()?->text())->toBe('baz');

    expect($nodeList->each(fn($node) => $node->text()))->toBe(['foo', 'bar', 'baz']);
});

it('can be constructed from a \Dom\NodeList instance', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
            <ul><li>foo</li><li>bar</li><li>baz</li></ul>
        </body>
        </html>
        HTML;

    $listItems = \Dom\HTMLDocument::createFromString($source, LIBXML_NOERROR)->querySelectorAll('ul li');

    // The second constructor argument wraps each original node in an HtmlElement.
    $makeElement = function (object $node): HtmlElement {
        /** @var \Dom\Node|DOMNode|Crawler $node */
        return new HtmlElement($node);
    };

    $nodeList = new NodeList($listItems, $makeElement);

    expect($nodeList->count())->toBe(3);

    expect($nodeList->first()?->text())->toBe('foo');

    expect($nodeList->nth(2)?->text())->toBe('bar');

    expect($nodeList->last()?->text())->toBe('baz');

    expect($nodeList->each(fn($node) => $node->text()))->toBe(['foo', 'bar', 'baz']);
})->group('php84');

it('can be instantiated from an array of Nodes (object instances from this library)', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
            <div class="list">
                <div class="element">foo</div><div class="element">bar</div><div class="element">baz</div>
            </div>
        </body>
        </html>
        HTML;

    $doc = new HtmlDocument($source);

    // Collect the library's own node objects into a plain array first.
    $nodes = [];

    foreach ($doc->querySelectorAll('.list .element') as $element) {
        $nodes[] = $element;
    }

    $listFromArray = new NodeList($nodes);

    expect($listFromArray->count())->toBe(3);

    expect($listFromArray->first()?->text())->toBe('foo');

    expect($listFromArray->last()?->text())->toBe('baz');

    expect($listFromArray->nth(2)?->text())->toBe('bar');
});

it('gets the count of the node list', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head>
            <title>Foo</title>
        </head>
        <body>
            <ul><li>foo</li><li>bar</li><li>baz</li></ul>
        </body>
        </html>
        HTML;

    $listItems = (new HtmlDocument($source))->querySelectorAll('ul li');

    expect($listItems->count())->toBe(3);
});

it('can be iterated and the elements are instances of Crwlr\Crawler\Steps\Dom\Node', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head>
            <title>Foo</title>
        </head>
        <body>
            <ul><li>foo</li><li>bar</li><li>baz</li></ul>
        </body>
        </html>
        HTML;

    $listItems = (new HtmlDocument($source))->querySelectorAll('ul li');

    // Count the iterations so the test can't pass vacuously when the list yields nothing.
    $iteratedNodes = 0;

    foreach ($listItems as $item) {
        expect($item)->toBeInstanceOf(Node::class);

        $iteratedNodes++;
    }

    expect($iteratedNodes > 0)->toBeTrue();
});

it(
    'can be iterated with the each() method and return values are returned as an array from the each() call',
    function () {
        $source = <<<HTML
            <!doctype html>
            <html>
            <head></head>
            <body>
                <div class="list">
                    <div class="element">foo</div>
                    <div class="element">bar</div>
                    <div class="element">baz</div>
                    <div class="element">quz</div>
                </div>
            </body>
            </html>
            HTML;

        $elements = (new HtmlDocument($source))->querySelectorAll('.list .element');

        // Whatever the callback returns per node is expected back as one array, in document order.
        $returnValues = $elements->each(fn($node) => $node->text() . ' check');

        expect($returnValues)->toBe([
            'foo check',
            'bar check',
            'baz check',
            'quz check',
        ]);
    },
);

test('an empty NodeList can be iterated', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head>
            <title>Foo</title>
        </head>
        <body>
            <ul><li>foo</li><li>bar</li><li>baz</li></ul>
        </body>
        </html>
        HTML;

    // There is no "lulu" element in the document, so this selector matches nothing.
    $noMatches = (new HtmlDocument($source))->querySelectorAll('ul lulu');

    $iteratedNodes = 0;

    foreach ($noMatches as $node) {
        $iteratedNodes++;
    }

    expect($iteratedNodes > 0)->toBeFalse();
});

it('returns the first, last and nth element of the NodeList', function () {
    $source = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
            <div class="list">
                <div class="element">foo</div>
                <div class="element">bar</div>
                <div class="element">baz</div>
                <div class="element">quz</div>
            </div>
        </body>
        </html>
        HTML;

    $elements = (new HtmlDocument($source))->querySelectorAll('.list .element');

    expect($elements->first())->toBeInstanceOf(HtmlElement::class);
    expect($elements->first()?->text())->toBe('foo');

    // nth() counts from 1: nth(2) is expected to be the second element ("bar").
    expect($elements->nth(2))->toBeInstanceOf(HtmlElement::class);
    expect($elements->nth(2)?->text())->toBe('bar');

    expect($elements->nth(3))->toBeInstanceOf(HtmlElement::class);
    expect($elements->nth(3)?->text())->toBe('baz');

    expect($elements->last())->toBeInstanceOf(HtmlElement::class);
    expect($elements->last()?->text())->toBe('quz');
});


================================================
FILE: tests/Steps/Dom/NodeTest.php
================================================
<?php

namespace Tests\Steps\Dom;

use Crwlr\Crawler\Steps\Dom\HtmlElement;
use Crwlr\Crawler\Steps\Dom\Node;
use Crwlr\Crawler\Steps\Dom\NodeList;
use Crwlr\Crawler\Steps\Dom\XmlElement;
use Dom\Element;
use Dom\HTMLDocument;
use Dom\XMLDocument;
use DOMNode;
use Exception;
use Symfony\Component\DomCrawler\Crawler;
use tests\Steps\Dom\_Stubs\HtmlNodeStub;
use tests\Steps\Dom\_Stubs\XmlNodeStub;

use const DOM\HTML_NO_DEFAULT_NS;

/**
 * Loads $source into a symfony Crawler and narrows it down to the first node matching $selectNode.
 */
function helper_getSymfonyCrawlerInstanceFromSource(string $source, string $selectNode = 'body'): Crawler
{
    $documentCrawler = new Crawler($source);

    $matches = $documentCrawler->filter($selectNode);

    return $matches->first();
}

/**
 * Loads $source into a symfony Crawler and returns the underlying DOMNode of the first match for $selectNode.
 *
 * @throws Exception when the Crawler has no node for the selector
 */
function helper_getLegacyDomNodeInstanceFromSource(string $source, string $selectNode = 'body'): DOMNode
{
    $firstMatch = (new Crawler($source))->filter($selectNode)->first();

    $domNode = $firstMatch->getNode(0);

    if ($domNode) {
        return $domNode;
    }

    throw new Exception('Can\'t get legacy node');
}

/**
 * Parses $source with the PHP 8.4 Dom\HTMLDocument parser and returns the first element matching $selectNode.
 *
 * @throws Exception when no element matches the selector
 */
function helper_getPhp84HtmlDomNodeInstanceFromSource(string $source, string $selectNode = 'body'): Element
{
    $node = HTMLDocument::createFromString($source, HTML_NO_DEFAULT_NS | LIBXML_NOERROR)->querySelector($selectNode);

    // querySelector() yields null when nothing matches. Fail with an explicit message (like the legacy
    // DOMNode helper does) instead of running into a return type error.
    if ($node === null) {
        throw new Exception('Can\'t get PHP 8.4 HTML node');
    }

    return $node;
}

/**
 * Parses $source with the PHP 8.4 Dom\XMLDocument parser and returns the first element matching $selectNode.
 *
 * @throws Exception when no element matches the selector
 */
function helper_getPhp84XmlDomNodeInstanceFromSource(string $source, string $selectNode = 'body'): Element
{
    $node = XMLDocument::createFromString($source, LIBXML_NOERROR)->querySelector($selectNode);

    // querySelector() yields null when nothing matches. Fail with an explicit message (like the legacy
    // DOMNode helper does) instead of running into a return type error.
    if ($node === null) {
        throw new Exception('Can\'t get PHP 8.4 XML node');
    }

    return $node;
}

/**
 * Wraps the given original node in one of the two test stubs: HtmlNodeStub when $html is true (the default),
 * XmlNodeStub otherwise.
 *
 * @param \Dom\Node|Element|DOMNode|Crawler $originalNode
 */
function helper_getAbstractNodeInstance(object $originalNode, bool $html = true): HtmlNodeStub|XmlNodeStub
{
    return $html ? new HtmlNodeStub($originalNode) : new XmlNodeStub($originalNode);
}

/* ----------------------------- Instantiation ----------------------------- */

it('can be created from a \DOM\Node instance', function () {
    $source = <<<XML
        <?xml version="1.0" encoding="utf-8"?>
        <items>
            <item><id>1</id><title>Foo</title></item>
        </items>
        XML;

    $nativeNode = helper_getPhp84XmlDomNodeInstanceFromSource($source, 'items item');

    // Precondition: the helper really produced a node of the new PHP 8.4 DOM API.
    expect($nativeNode)->toBeInstanceOf(\Dom\Node::class);

    // Minimal concrete subclass, only implementing the abstract factory method.
    $wrapped = new class ($nativeNode) extends Node {
        protected function makeChildNodeInstance(object $node): Node
        {
            return new XmlElement($node);
        }
    };

    expect($wrapped)->toBeInstanceOf(Node::class);
    expect($wrapped->text())->toBe('1Foo');
})->group('php84');

it('can be instantiated from a symfony Crawler instance', function () {
    $source = <<<XML
        <?xml version="1.0" encoding="utf-8"?>
        <items>
            <item><id>1</id><title>Foo</title></item>
        </items>
        XML;

    $itemCrawler = helper_getSymfonyCrawlerInstanceFromSource($source, 'items item');

    // Precondition: the helper really produced a symfony Crawler.
    expect($itemCrawler)->toBeInstanceOf(Crawler::class);

    // Minimal concrete subclass, only implementing the abstract factory method.
    $wrapped = new class ($itemCrawler) extends Node {
        protected function makeChildNodeInstance(object $node): Node
        {
            return new XmlElement($node);
        }
    };

    expect($wrapped)->toBeInstanceOf(Node::class);
    expect($wrapped->text())->toBe('1Foo');
});

it('can be instantiated from a DOMNode instance', function () {
    $source = <<<XML
        <?xml version="1.0" encoding="utf-8"?>
        <items>
            <item><id>1</id><title>Foo</title></item>
        </items>
        XML;

    $legacyNode = helper_getLegacyDomNodeInstanceFromSource($source, 'items item');

    // Precondition: the helper really produced a node of the legacy DOM API.
    expect($legacyNode)->toBeInstanceOf(DOMNode::class);

    // Minimal concrete subclass, only implementing the abstract factory method.
    $wrapped = new class ($legacyNode) extends Node {
        protected function makeChildNodeInstance(object $node): Node
        {
            return new XmlElement($node);
        }
    };

    expect($wrapped)->toBeInstanceOf(Node::class);
    expect($wrapped->text())->toBe('1Foo');
});

/* ----------------------------- querySelector(All)() ----------------------------- */

// Fixture: the body holds a single `.foo` wrapper containing one <h1>.
$html = <<<HTML
    <!doctype html>
    <html>
    <head>
        <title>Foo</title>
    </head>
    <body>
        <div class="foo">
            <h1>Title</h1>
        </div>
    </body>
    </html>
    HTML;

it('selects an element within a node via querySelector()', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $heading = helper_getAbstractNodeInstance($originalNode)->querySelector('.foo h1');

    expect($heading)->toBeInstanceOf(Node::class);
    expect($heading?->text())->toBe('Title');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html)],
    [helper_getLegacyDomNodeInstanceFromSource($html)],
]);

it('selects an element within a node via querySelector() in PHP >= 8.4', function () use ($html) {
    // Same scenario as the data set based test, but starting from a PHP 8.4 Dom\Element.
    $body = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html));

    $heading = $body->querySelector('.foo h1');

    expect($heading)->toBeInstanceOf(Node::class);
    expect($heading?->text())->toBe('Title');
})->group('php84');

// Fixture: two `.foo` wrappers, each with its own <h2>, so the selector below matches twice.
$html = <<<HTML
    <!doctype html>
    <html>
    <head><title>Bar</title></head>
    <body>
        <div class="foo">
            <h2>Foo</h2>
        </div>
        <div class="foo">
            <h2>Bar</h2>
        </div>
    </body>
    </html>
    HTML;

test(
    'querySelector() selects the first element within a node, when multiple nodes match a selector',
    function (object $originalNode) {
        /** @var Crawler|DOMNode $originalNode */
        $firstHeading = helper_getAbstractNodeInstance($originalNode)->querySelector('.foo h2');

        expect($firstHeading)->toBeInstanceOf(Node::class);

        // "Foo" is the text of the first of the two matching <h2> elements.
        expect($firstHeading?->text())->toBe('Foo');
    },
)->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html)],
    [helper_getLegacyDomNodeInstanceFromSource($html)],
]);

it(
    'selects the first element within a node using querySelector(), when multiple nodes match a selector in PHP >= 8.4',
    function () use ($html) {
        $body = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html));

        $firstHeading = $body->querySelector('.foo h2');

        expect($firstHeading)->toBeInstanceOf(Node::class);

        // "Foo" is the text of the first of the two matching <h2> elements.
        expect($firstHeading?->text())->toBe('Foo');
    },
)->group('php84');

// Fixture: a body containing only text, so no element selector can match inside it.
$html = <<<HTML
    <!doctype html>
    <html>
    <head><title>Foo</title></head>
    <body>
        yo
    </body>
    </html>
    HTML;

it('returns null when the selector passed to querySelector() matches nothing', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $body = helper_getAbstractNodeInstance($originalNode);

    expect($body->querySelector('.foo h2'))->toBeNull();
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html)],
    [helper_getLegacyDomNodeInstanceFromSource($html)],
]);

it('returns null when the selector passed to querySelector() matches nothing in PHP >= 8.4', function () use ($html) {
    $body = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html));

    expect($body->querySelector('.foo h2'))->toBeNull();
})->group('php84');

// Fixture: a feed with three items, each carrying an id and a title.
$xml = <<<XML
    <?xml version="1.0" encoding="utf-8"?>
    <feed>
      <items>
        <item><id>1</id><title>Foo</title></item>
        <item><id>2</id><title>Bar</title></item>
        <item><id>3</id><title>Baz</title></item>
      </items>
    </feed>
    XML;

it('selects all elements within a node, matching a selector using querySelectorAll()', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $titles = helper_getAbstractNodeInstance($originalNode)->querySelectorAll('items item title');

    expect($titles)->toBeInstanceOf(NodeList::class);
    expect($titles->count())->toBe(3);

    // The titles come back in document order.
    expect($titles->first()?->text())->toBe('Foo');
    expect($titles->nth(2)?->text())->toBe('Bar');
    expect($titles->last()?->text())->toBe('Baz');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($xml, 'feed')],
    [helper_getLegacyDomNodeInstanceFromSource($xml, 'feed')],
]);

it(
    'selects all elements within a node, matching a selector using querySelectorAll() in PHP >= 8.4',
    function () use ($xml) {
        $feed = helper_getAbstractNodeInstance(helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'feed'));

        $titles = $feed->querySelectorAll('items item title');

        expect($titles)->toBeInstanceOf(NodeList::class);
        expect($titles->count())->toBe(3);

        // The titles come back in document order.
        expect($titles->first()?->text())->toBe('Foo');
        expect($titles->nth(2)?->text())->toBe('Bar');
        expect($titles->last()?->text())->toBe('Baz');
    },
)->group('php84');

// Fixture: three items that only have an id, so there is no <author> element to select.
$xml = <<<XML
    <?xml version="1.0" encoding="utf-8"?>
    <feed>
        <items><item><id>1</id></item><item><id>2</id></item><item><id>3</id></item></items>
    </feed>
    XML;

it(
    'gets an empty NodeList when nothing matches the selector passed to querySelectorAll()',
    function (object $originalNode) {
        /** @var Crawler|DOMNode $originalNode */
        $authors = helper_getAbstractNodeInstance($originalNode)->querySelectorAll('items item author');

        // Still a NodeList, just without any nodes in it.
        expect($authors)->toBeInstanceOf(NodeList::class);
        expect($authors->count())->toBe(0);
    },
)->with([
    [helper_getSymfonyCrawlerInstanceFromSource($xml, 'feed')],
    [helper_getLegacyDomNodeInstanceFromSource($xml, 'feed')],
]);

it(
    'gets an empty NodeList when nothing matches the selector passed to querySelectorAll() in PHP >= 8.4',
    function () use ($xml) {
        $feed = helper_getAbstractNodeInstance(helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'feed'));

        $authors = $feed->querySelectorAll('items item author');

        // Still a NodeList, just without any nodes in it.
        expect($authors)->toBeInstanceOf(NodeList::class);
        expect($authors->count())->toBe(0);
    },
)->group('php84');

// Fixture: a body with one list of three items.
$html = <<<HTML
    <!doctype html>
    <html>
    <head><title>Lorem Ipsum</title></head>
    <body>
        <ul><li>hip</li><li>hop</li><li>hooray</li></ul>
    </body>
    </html>
    HTML;

/* ----------------------------- queryXPath() ----------------------------- */

it(
    'selects all elements within a node, matching an XPath query using queryXPath()',
    function (object $originalNode) {
        /** @var Crawler|DOMNode $originalNode */
        $listItems = helper_getAbstractNodeInstance($originalNode)->queryXPath('//ul/li');

        expect($listItems)->toBeInstanceOf(NodeList::class);
        expect($listItems->count())->toBe(3);

        // The list items come back in document order.
        expect($listItems->first()?->text())->toBe('hip');
        expect($listItems->nth(2)?->text())->toBe('hop');
        expect($listItems->last()?->text())->toBe('hooray');
    },
)->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html)],
    [helper_getLegacyDomNodeInstanceFromSource($html)],
]);

it(
    'selects all elements within a node, matching an XPath query using queryXPath() in PHP >= 8.4',
    function () use ($html) {
        $body = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html));

        $listItems = $body->queryXPath('//ul/li');

        expect($listItems)->toBeInstanceOf(NodeList::class);
        expect($listItems->count())->toBe(3);

        // The list items come back in document order.
        expect($listItems->first()?->text())->toBe('hip');
        expect($listItems->nth(2)?->text())->toBe('hop');
        expect($listItems->last()?->text())->toBe('hooray');
    },
)->group('php84');

it('gets an empty NodeList when nothing matches the selector passed to queryXPath()', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $strongTags = helper_getAbstractNodeInstance($originalNode)->queryXPath('//ul/li/strong');

    // The list items contain no <strong> element, so the result is an empty NodeList.
    expect($strongTags)->toBeInstanceOf(NodeList::class);
    expect($strongTags->count())->toBe(0);
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html)],
    [helper_getLegacyDomNodeInstanceFromSource($html)],
]);

// PHP 8.4 variant of the "empty NodeList" queryXPath() test: starts from a Dom\Element created with the new
// DOM API and expects a NodeList without any nodes, because no <li> contains a <strong> element.
it(
    'gets an empty NodeList when nothing matches the selector passed to queryXPath() in PHP >= 8.4',
    function () use ($html) {
        $originalNode = helper_getPhp84HtmlDomNodeInstanceFromSource($html);

        $node = helper_getAbstractNodeInstance($originalNode);

        $selected = $node->queryXPath('//ul/li/strong');

        expect($selected)->toBeInstanceOf(NodeList::class)
            ->and($selected->count())->toBe(0);
    },
)->group('php84');

/* ----------------------------- removeNodesMatchingSelector() ----------------------------- */

// Fixture: a list where the first and the fourth item carry the class `remove`.
$html = <<<HTML
    <!doctype html>
    <html>
    <head></head>
    <body>
        <ul id="list">
            <li class="remove">foo</li>
            <li>bar</li>
            <li>baz</li>
            <li class="remove">quz</li>
            <li>lorem</li>
        </ul>
    </body>
    </html>
    HTML;

it('removes all nodes that match a given CSS selector', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $body = helper_getAbstractNodeInstance($originalNode);

    $body->removeNodesMatchingSelector('#list .remove');

    $remainingSource = $body->outer();

    // Untouched items are still there...
    expect($remainingSource)->toContain('<li>bar</li>');
    expect($remainingSource)->toContain('<li>baz</li>');

    // ...while the `.remove` items and their texts are gone.
    expect($remainingSource)->not()->toContain('<li class="remove">');
    expect($remainingSource)->not()->toContain('foo');
    expect($remainingSource)->not()->toContain('quz');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html)],
    [helper_getLegacyDomNodeInstanceFromSource($html)],
]);

it('removes all nodes that match a given CSS selector in PHP >= 8.4', function () use ($html) {
    $body = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html));

    $body->removeNodesMatchingSelector('#list .remove');

    $remainingSource = $body->outer();

    // Untouched items are still there...
    expect($remainingSource)->toContain('<li>bar</li>');
    expect($remainingSource)->toContain('<li>baz</li>');

    // ...while the `.remove` items and their texts are gone.
    expect($remainingSource)->not()->toContain('<li class="remove">');
    expect($remainingSource)->not()->toContain('foo');
    expect($remainingSource)->not()->toContain('quz');
})->group('php84');

// Fixture: a feed with three items, each consisting of id, title and description.
$xml = <<<XML
    <?xml version="1.0" encoding="utf-8"?>
    <feed>
        <items>
            <item>
                <id>1</id>
                <title>foo</title>
                <description>lorem</description>
            </item>
            <item>
                <id>2</id>
                <title>bar</title>
                <description>ipsum</description>
            </item>
            <item>
                <id>3</id>
                <title>baz</title>
                <description>dolor</description>
            </item>
        </items>
    </feed>
    XML;

it('removes all nodes that match a given CSS selector from XML', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $feed = helper_getAbstractNodeInstance($originalNode, false); // false => wrap in the XML stub

    $feed->removeNodesMatchingSelector('feed items item title');

    $remainingSource = $feed->outer();

    // Only the <title> elements are dropped, their siblings stay.
    expect($remainingSource)->toContain('<id>');
    expect($remainingSource)->toContain('<description>');
    expect($remainingSource)->not()->toContain('<title>');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($xml, 'feed')],
]);

it('removes all nodes that match a given CSS selector from XML in PHP >= 8.4', function () use ($xml) {
    // NOTE(review): the XML node is wrapped using the helper's default ($html = true, i.e. the HTML stub),
    // whereas the non-8.4 variant of this test passes false. Confirm whether that is intended.
    $feed = helper_getAbstractNodeInstance(helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'feed'));

    $feed->removeNodesMatchingSelector('feed items item title');

    $remainingSource = $feed->outer();

    // Only the <title> elements are dropped, their siblings stay.
    expect($remainingSource)->toContain('<id>');
    expect($remainingSource)->toContain('<description>');
    expect($remainingSource)->not()->toContain('<title>');
})->group('php84');

/* ----------------------------- removeNodesMatchingXPath() ----------------------------- */

// Fixture: a list where the first and the fourth item carry the class `remove`.
$html = <<<HTML
    <!doctype html>
    <html>
    <head></head>
    <body>
        <ul id="list">
            <li class="remove">foo</li>
            <li>bar</li>
            <li>baz</li>
            <li class="remove">quz</li>
            <li>lorem</li>
        </ul>
    </body>
    </html>
    HTML;

it('removes all nodes that match a given XPath query', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $body = helper_getAbstractNodeInstance($originalNode);

    $body->removeNodesMatchingXPath('//li[contains(@class, \'remove\')]');

    $remainingSource = $body->outer();

    // Untouched items are still there...
    expect($remainingSource)->toContain('<li>bar</li>');
    expect($remainingSource)->toContain('<li>baz</li>');

    // ...while the items with the `remove` class and their texts are gone.
    expect($remainingSource)->not()->toContain('<li class="remove">');
    expect($remainingSource)->not()->toContain('foo');
    expect($remainingSource)->not()->toContain('quz');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html)],
    [helper_getLegacyDomNodeInstanceFromSource($html)],
]);

it('removes all nodes that match a given XPath query in PHP >= 8.4', function () use ($html) {
    $body = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html));

    $body->removeNodesMatchingXPath('//li[contains(@class, \'remove\')]');

    $remainingSource = $body->outer();

    // Untouched items are still there...
    expect($remainingSource)->toContain('<li>bar</li>');
    expect($remainingSource)->toContain('<li>baz</li>');

    // ...while the items with the `remove` class and their texts are gone.
    expect($remainingSource)->not()->toContain('<li class="remove">');
    expect($remainingSource)->not()->toContain('foo');
    expect($remainingSource)->not()->toContain('quz');
})->group('php84');

// Fixture: a feed with three items, each consisting of id, title and description.
$xml = <<<XML
    <?xml version="1.0" encoding="utf-8"?>
    <feed>
        <items>
            <item>
                <id>1</id>
                <title>foo</title>
                <description>lorem</description>
            </item>
            <item>
                <id>2</id>
                <title>bar</title>
                <description>ipsum</description>
            </item>
            <item>
                <id>3</id>
                <title>baz</title>
                <description>dolor</description>
            </item>
        </items>
    </feed>
    XML;

it('removes all nodes that match a given XPath query from XML', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $feed = helper_getAbstractNodeInstance($originalNode);

    $feed->removeNodesMatchingXPath('//feed/items/item/title');

    $remainingSource = $feed->outer();

    // Only the <title> elements are dropped, their siblings stay.
    expect($remainingSource)->toContain('<id>');
    expect($remainingSource)->toContain('<description>');
    expect($remainingSource)->not()->toContain('<title>');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($xml, 'feed')],
]);

it('removes all nodes that match a given XPath query from XML in PHP >= 8.4', function () use ($xml) {
    $feed = helper_getAbstractNodeInstance(helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'feed'));

    $feed->removeNodesMatchingXPath('//feed/items/item/title');

    $remainingSource = $feed->outer();

    // Only the <title> elements are dropped, their siblings stay.
    expect($remainingSource)->toContain('<id>');
    expect($remainingSource)->toContain('<description>');
    expect($remainingSource)->not()->toContain('<title>');
})->group('php84');

/* ----------------------------- getAttribute() ----------------------------- */

// Fixture: a single `.element` div carrying a data-test attribute.
$html = <<<HTML
    <!doctype html>
    <html>
    <head><title>Foo</title></head>
    <body>
        <div class="element" data-test="hi"></div>
    </body>
    </html>
    HTML;

it('gets the value of an attribute', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $attributeValue = helper_getAbstractNodeInstance($originalNode)->getAttribute('data-test');

    expect($attributeValue)->toBe('hi');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html, '.element')],
    [helper_getLegacyDomNodeInstanceFromSource($html, '.element')],
]);

it('gets the value of an attribute in PHP >= 8.4', function () use ($html) {
    $element = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html, '.element'));

    $attributeValue = $element->getAttribute('data-test');

    expect($attributeValue)->toBe('hi');
})->group('php84');

// Fixture: a single `.element` div without any data-test attribute.
$html = <<<HTML
    <!doctype html>
    <html>
    <head><title>Foo</title></head>
    <body><div class="element"></div></body>
    </html>
    HTML;

it('returns null when an attribute does not exist', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $attributeValue = helper_getAbstractNodeInstance($originalNode)->getAttribute('data-test');

    expect($attributeValue)->toBeNull();
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html, '.element')],
    [helper_getLegacyDomNodeInstanceFromSource($html, '.element')],
]);

it('returns null when an attribute does not exist in PHP >= 8.4', function () use ($html) {
    $element = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html, '.element'));

    $attributeValue = $element->getAttribute('data-test');

    expect($attributeValue)->toBeNull();
})->group('php84');

/* ----------------------------- nodeName() ----------------------------- */

it('gets the name of a node', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $name = helper_getAbstractNodeInstance($originalNode)->nodeName();

    // The selected `.element` node is a <div>.
    expect($name)->toBe('div');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html, '.element')],
    [helper_getLegacyDomNodeInstanceFromSource($html, '.element')],
]);

it('gets the name of a node in PHP >= 8.4', function () use ($html) {
    $element = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html, '.element'));

    // The selected `.element` node is a <div>.
    expect($element->nodeName())->toBe('div');
})->group('php84');

// Fixture: an <article> whose child elements are separated by single spaces.
$html = <<<HTML
    <!doctype html>
    <html>
    <head><title>Bar</title></head>
    <body>
        <article> <h1>Title</h1> <p>Lorem ipsum.</p> </article>
    </body>
    </html>
    HTML;

/* ----------------------------- text() ----------------------------- */

it('gets the text content of an HTML node', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $text = helper_getAbstractNodeInstance($originalNode)->text();

    // The expected text has no leading or trailing whitespace, unlike the article's source.
    expect($text)->toBe('Title Lorem ipsum.');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html, 'article')],
    [helper_getLegacyDomNodeInstanceFromSource($html, 'article')],
]);

it('gets the text content of an HTML node in PHP >= 8.4', function () use ($html) {
    $article = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html, 'article'));

    // The expected text has no leading or trailing whitespace, unlike the article's source.
    expect($article->text())->toBe('Title Lorem ipsum.');
})->group('php84');

/* ----------------------------- innerSource() ----------------------------- */

it('gets the inner source of an HTML node', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $innerSource = helper_getAbstractNodeInstance($originalNode)->inner();

    // Everything between the <article> tags, including the surrounding spaces.
    expect($innerSource)->toBe(' <h1>Title</h1> <p>Lorem ipsum.</p> ');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html, 'article')],
    [helper_getLegacyDomNodeInstanceFromSource($html, 'article')],
]);

it('gets the inner source of an HTML node in PHP >= 8.4', function () use ($html) {
    $article = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html, 'article'));

    // Everything between the <article> tags, including the surrounding spaces.
    expect($article->inner())->toBe(' <h1>Title</h1> <p>Lorem ipsum.</p> ');
})->group('php84');

/* ----------------------------- outerSource() ----------------------------- */

it('gets the outer source of an HTML node', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $outerSource = helper_getAbstractNodeInstance($originalNode)->outer();

    // The <article> element itself plus everything inside it.
    expect($outerSource)->toBe('<article> <h1>Title</h1> <p>Lorem ipsum.</p> </article>');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($html, 'article')],
    [helper_getLegacyDomNodeInstanceFromSource($html, 'article')],
]);

it('gets the outer source of an HTML node in PHP >= 8.4', function () use ($html) {
    $article = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html, 'article'));

    // The <article> element itself plus everything inside it.
    expect($article->outer())->toBe('<article> <h1>Title</h1> <p>Lorem ipsum.</p> </article>');
})->group('php84');

// Fixture: a single item whose child elements are separated by single spaces.
$xml = <<<XML
    <?xml version="1.0" encoding="utf-8"?>
    <items> <item> <id>1</id> <title>Lorem Ipsum</title> </item> </items>
    XML;

/* ----------------------------- text() XML ----------------------------- */

it('gets the text content of an XML node', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $text = helper_getAbstractNodeInstance($originalNode)->text();

    // The expected text has no leading or trailing whitespace, unlike the item's source.
    expect($text)->toBe('1 Lorem Ipsum');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item')],
    [helper_getLegacyDomNodeInstanceFromSource($xml, 'items item')],
]);

it('gets the text content of an XML node in PHP >= 8.4', function () use ($xml) {
    $item = helper_getAbstractNodeInstance(helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'));

    // The expected text has no leading or trailing whitespace, unlike the item's source.
    expect($item->text())->toBe('1 Lorem Ipsum');
})->group('php84');

/* ----------------------------- innerSource() XML ----------------------------- */

it('gets the inner source of an XML node', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $innerSource = helper_getAbstractNodeInstance($originalNode)->inner();

    // Everything between the <item> tags, including the surrounding spaces.
    expect($innerSource)->toBe(' <id>1</id> <title>Lorem Ipsum</title> ');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item')],
    [helper_getLegacyDomNodeInstanceFromSource($xml, 'items item')],
]);

it('gets the inner source of an XML node in PHP >= 8.4', function () use ($xml) {
    $item = helper_getAbstractNodeInstance(helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'));

    // Everything between the <item> tags, including the surrounding spaces.
    expect($item->inner())->toBe(' <id>1</id> <title>Lorem Ipsum</title> ');
})->group('php84');

/* ----------------------------- outerSource() XML ----------------------------- */

it('gets the outer source of an XML node', function (object $originalNode) {
    /** @var Crawler|DOMNode $originalNode */
    $outerSource = helper_getAbstractNodeInstance($originalNode)->outer();

    // The <item> element itself plus everything inside it.
    expect($outerSource)->toBe('<item> <id>1</id> <title>Lorem Ipsum</title> </item>');
})->with([
    [helper_getSymfonyCrawlerInstanceFromSource($xml, 'items item')],
    [helper_getLegacyDomNodeInstanceFromSource($xml, 'items item')],
]);

it('gets the outer source of an XML node in PHP >= 8.4', function () use ($xml) {
    $item = helper_getAbstractNodeInstance(helper_getPhp84XmlDomNodeInstanceFromSource($xml, 'items item'));

    // The <item> element itself plus everything inside it.
    expect($item->outer())->toBe('<item> <id>1</id> <title>Lorem Ipsum</title> </item>');
})->group('php84');

// Fixture: two lists; only the first one contains an element with the class `foo`,
// the second one merely contains the text "foo".
$html = <<<HTML
    <!doctype html>
    <html>
    <head><title>Bar</title></head>
    <body>
        <ul><li class="foo">one</li></ul>

        <ul><li>foo</li></ul>
    </body>
    </html>
    HTML;

/* ------------ :has() :not() CSS pseudo class selectors in PHP 8.4 ------------- */

it('selects elements using a CSS selector containing the :has() pseudo class', function () use ($html) {
    $body = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html));

    $listWithFoo = $body->querySelector('ul:has(.foo)');

    // The first list is the one containing a `.foo` element.
    expect($listWithFoo)->toBeInstanceOf(HtmlElement::class);
    expect($listWithFoo?->text())->toBe('one');
})->group('php84');

it('selects elements using a CSS selector containing the :not() pseudo class', function () use ($html) {
    $body = helper_getAbstractNodeInstance(helper_getPhp84HtmlDomNodeInstanceFromSource($html));

    $listWithoutFoo = $body->querySelector('ul:not(:has(.foo))');

    // The second list has no `.foo` element; its only item has the text "foo".
    expect($listWithoutFoo)->toBeInstanceOf(HtmlElement::class);
    expect($listWithoutFoo?->text())->toBe('foo');
})->group('php84');


================================================
FILE: tests/Steps/Dom/XmlDocumentTest.php
================================================
<?php

namespace tests\Steps\Dom;

use Crwlr\Crawler\Steps\Dom\NodeList;
use Crwlr\Crawler\Steps\Dom\XmlDocument;
use Crwlr\Crawler\Steps\Dom\XmlElement;

test('the querySelector() method returns an XmlElement object', function () {
    $source = <<<XML
        <?xml version="1.0" encoding="utf-8"?>
        <feed>
            <items><item><id>1</id></item></items>
        </feed>
        XML;

    $item = (new XmlDocument($source))->querySelector('feed items item');

    expect($item)->toBeInstanceOf(XmlElement::class);
});

test('the querySelectorAll() method returns a NodeList of XmlElement objects', function () {
    $source = <<<XML
        <?xml version="1.0" encoding="utf-8"?>
        <feed>
            <items><item><id>1</id></item><item><id>2</id></item><item><id>3</id></item></items>
        </feed>
        XML;

    $items = (new XmlDocument($source))->querySelectorAll('feed items item');

    expect($items)->toBeInstanceOf(NodeList::class);

    // Guards against a vacuous pass: the loop body must run at least once.
    $sawAnyItem = false;

    foreach ($items as $item) {
        $sawAnyItem = true;

        expect($item)->toBeInstanceOf(XmlElement::class);
    }

    expect($sawAnyItem)->toBeTrue();
});

test('the queryXPath() method returns a NodeList of XmlElement objects', function () {
    $source = <<<XML
        <?xml version="1.0" encoding="utf-8"?>
        <feed>
            <items><item><id>1</id></item><item><id>2</id></item><item><id>3</id></item></items>
        </feed>
        XML;

    $items = (new XmlDocument($source))->queryXPath('//feed/items/item');

    expect($items)->toBeInstanceOf(NodeList::class);

    // Guards against a vacuous pass: the loop body must run at least once.
    $sawAnyItem = false;

    foreach ($items as $item) {
        $sawAnyItem = true;

        expect($item)->toBeInstanceOf(XmlElement::class);
    }

    expect($sawAnyItem)->toBeTrue();
});

it('is able to parse documents containing characters that aren\'t valid within XML documents', function (string $char) {
    // The character under test is embedded in the CDATA section of the only <title>.
    $source = <<<XML
        <?xml version="1.0" encoding="UTF-8"?>
        <rss>
        <channel>
        <items>
        <item>
        <title><![CDATA[foo - {$char} - bar]]></title>
        </item>
        </items>
        </channel>
        </rss>
        XML;

    $titles = (new XmlDocument($source))->querySelectorAll('channel item title');

    expect($titles)->toBeInstanceOf(NodeList::class);
    expect($titles->count())->toBe(1);

    // Only the text around the character is pinned, not what happens to the character itself.
    expect($titles->first()?->text())->toStartWith('foo - ');
    expect($titles->first()?->text())->toEndWith(' - bar');
})->with([
    [mb_chr(0)],
    [mb_chr(6)],
    [mb_chr(12)],
    [mb_chr(20)],
    [mb_chr(31)],
    [mb_chr(128)],
    [mb_chr(157)],
    [mb_chr(195)],
    [mb_chr(253)],
]);


================================================
FILE: tests/Steps/Dom/XmlElementTest.php
================================================
<?php

namespace tests\Steps\Dom;

use Crwlr\Crawler\Steps\Dom\NodeList;
use Crwlr\Crawler\Steps\Dom\XmlDocument;
use Crwlr\Crawler\Steps\Dom\XmlElement;

// Shared XML fixture for the tests in this file: a <feed> with two <item> elements.
// Only the first item's <title> has a lang attribute, and the first item's <foo> has whitespace
// around its child element, whereas the second item's <foo> has two children without whitespace.
$xml = <<<XML
    <?xml version="1.0" encoding="utf-8"?>
    <feed>
        <channelName>foo</channelName>
        <channelIdentifier>foo</channelIdentifier>
        <items>
            <item>
                <id>abc-123</id>
                <updated>2024-11-07T11:00:31Z</updated>
                <title lang="en">Foo bar baz!</title>
                <someUrl>https://www.example.com/item-1?utm_source=foo&amp;utm_medium=feed-xml</someUrl>
                <foo>  <baRbaz>test</baRbaz>  </foo>
            </item>
            <item>
                <id>abc-124</id>
                <updated>2024-12-04T22:43:14Z</updated>
                <title>Lorem Ipsum!</title>
                <someUrl>https://www.example.com/item-2?utm_source=foo&amp;utm_medium=feed-xml</someUrl>
                <foo><baRbaz>hey</baRbaz><quz>ho</quz></foo>
            </item>
        </items>
    </feed>
    XML;

// Elements selected below an XmlElement must again be XmlElement instances. The description used to
// say "HtmlElement", which contradicted the assertions below.
test('child nodes selected via querySelector() are XmlElement instances', function () use ($xml) {
    $document = new XmlDocument($xml);

    $wrapperElement = $document->querySelector('feed');

    expect($wrapperElement)->toBeInstanceOf(XmlElement::class)
        ->and($wrapperElement?->querySelector('items item'))->toBeInstanceOf(XmlElement::class);
});

// All elements of a NodeList selected below an XmlElement must again be XmlElement instances. The
// description used to say "HtmlElement", which contradicted the assertions below.
test('child nodes selected via querySelectorAll() are XmlElement instances', function () use ($xml) {
    $document = new XmlDocument($xml);

    $wrapperElement = $document->querySelector('feed');

    expect($wrapperElement)->toBeInstanceOf(XmlElement::class);

    $childNodeList = $wrapperElement?->querySelectorAll('items item');

    // The fixture contains exactly two <item> elements.
    expect($childNodeList)->toBeInstanceOf(NodeList::class)
        ->and($childNodeList?->count())->toBe(2)
        ->and($childNodeList?->first())->toBeInstanceOf(XmlElement::class)
        ->and($childNodeList?->last())->toBeInstanceOf(XmlElement::class);
});

it('gets the node name', function () use ($xml) {
    $feed = (new XmlDocument($xml))->querySelector('feed');

    expect($feed?->nodeName())->toBe('feed');

    expect($feed?->querySelector('items item')?->nodeName())->toBe('item');
});

it('gets the text of a node', function () use ($xml) {
    // The second item's <foo> element contains <baRbaz>hey</baRbaz><quz>ho</quz>.
    $foo = (new XmlDocument($xml))->querySelector('feed items item:nth-child(2) foo');

    expect($foo?->text())->toBe('heyho');
});

it('gets the outer XML of a node', function () use ($xml) {
    $barBaz = (new XmlDocument($xml))->querySelector('feed items item foo baRbaz');

    expect($barBaz?->outerXml())->toBe('<baRbaz>test</baRbaz>');
});

it('gets the inner XML of a node', function () use ($xml) {
    // The whitespace around the child element of the first item's <foo> is expected to be kept.
    $foo = (new XmlDocument($xml))->querySelector('feed items item foo');

    expect($foo?->innerXml())->toBe('  <baRbaz>test</baRbaz>  ');
});

it('gets an attribute from a node', function () use ($xml) {
    $title = (new XmlDocument($xml))->querySelector('feed items item:first-child title');

    expect($title?->getAttribute('lang'))->toBe('en');
});


================================================
FILE: tests/Steps/Dom/_Stubs/HtmlNodeStub.php
================================================
<?php

namespace tests\Steps\Dom\_Stubs;

use Crwlr\Crawler\Steps\Dom\HtmlElement;
use Crwlr\Crawler\Steps\Dom\Node;

/**
 * Node subclass for tests, that creates HtmlElement instances for child nodes.
 *
 * NOTE(review): the public inner()/outer() methods presumably exist because innerSource() and
 * outerSource() are not publicly callable on Node – confirm against the Node class.
 */
class HtmlNodeStub extends Node
{
    /**
     * Returns whatever the inherited innerSource() method returns.
     */
    public function inner(): string
    {
        return $this->innerSource();
    }

    /**
     * Returns whatever the inherited outerSource() method returns.
     */
    public function outer(): string
    {
        return $this->outerSource();
    }

    /**
     * Wraps the given node object in an HtmlElement.
     */
    protected function makeChildNodeInstance(object $node): Node
    {
        return new HtmlElement($node);
    }
}


================================================
FILE: tests/Steps/Dom/_Stubs/XmlNodeStub.php
================================================
<?php

namespace tests\Steps\Dom\_Stubs;

use Crwlr\Crawler\Steps\Dom\Node;
use Crwlr\Crawler\Steps\Dom\XmlElement;

/**
 * Node subclass for tests, that creates XmlElement instances for child nodes.
 *
 * NOTE(review): the public inner()/outer() methods presumably exist because innerSource() and
 * outerSource() are not publicly callable on Node – confirm against the Node class.
 */
class XmlNodeStub extends Node
{
    /**
     * Returns whatever the inherited innerSource() method returns.
     */
    public function inner(): string
    {
        return $this->innerSource();
    }

    /**
     * Returns whatever the inherited outerSource() method returns.
     */
    public function outer(): string
    {
        return $this->outerSource();
    }

    /**
     * Wraps the given node object in an XmlElement.
     */
    protected function makeChildNodeInstance(object $node): Node
    {
        return new XmlElement($node);
    }
}


================================================
FILE: tests/Steps/DomTest.php
================================================
<?php

namespace tests\Steps;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Steps\Dom;
use Crwlr\Crawler\Steps\Html\CssSelector;
use Crwlr\Crawler\Steps\Html\DomQuery;
use Crwlr\Crawler\Steps\Html\XPathQuery;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use stdClass;
use tests\_Stubs\DummyLogger;

use function tests\helper_getStepFilesContent;
use function tests\helper_invokeStepWithInput;
use function tests\helper_traverseIterable;

/**
 * Creates an anonymous Dom step for the tests in this file.
 *
 * The anonymous class turns plain query strings into CssSelector instances.
 *
 * @param mixed[] $mapping
 */
function helper_getDomStepInstance(array $mapping = []): Dom
{
    $domStep = new class ($mapping) extends Dom {
        protected function makeDefaultDomQueryInstance(string $query): DomQuery
        {
            return new CssSelector($query);
        }
    };

    return $domStep;
}

test('string is valid input', function () {
    $step = helper_getDomStepInstance()::root();

    $output = helper_invokeStepWithInput(
        $step,
        '<!DOCTYPE html><html><head></head><body><h1>Überschrift</h1></body>',
    );

    // No extract() mapping was defined, the first output is expected to be an empty array.
    expect($output[0]->get())->toBe([]);
});

test('ResponseInterface is a valid input', function () {
    $step = helper_getDomStepInstance()::root();

    $output = helper_invokeStepWithInput($step, new Response());

    expect($output[0]->get())->toBe([]);
});

test('RespondedRequest is a valid input', function () {
    $step = helper_getDomStepInstance()::root();

    $respondedRequest = new RespondedRequest(new Request('GET', '/'), new Response());

    $output = helper_invokeStepWithInput($step, $respondedRequest);

    expect($output[0]->get())->toBe([]);
});

test('For other inputs an error message is logged', function (mixed $input) {
    $logger = new DummyLogger();

    $step = helper_getDomStepInstance()::root()->addLogger($logger);

    helper_traverseIterable($step->invokeStep(new Input($input)));

    // Make sure something was logged, before looking at the first message.
    expect($logger->messages)->not->toBeEmpty();

    $firstMessage = $logger->messages[0]['message'];

    expect($firstMessage)
        ->toStartWith('A step was called with input that it can not work with: ')
        ->toEndWith('. The invalid input is of type ' . gettype($input) . '.');
})->with([
    // Neither integers, floats, nor arbitrary objects are accepted as input.
    [123],
    [123.456],
    [new stdClass()],
]);

it('outputs a single string when argument for extract is a selector string matching only one element', function () {
    $step = helper_getDomStepInstance()::root()->extract('.list .item:first-child .match');

    $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('match 2');
});

it('outputs multiple strings when argument for extract is a selector string matching multiple elements', function () {
    $step = helper_getDomStepInstance()::root()->extract('.match');

    $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    // One separate output per matching element.
    expect($outputs)->toHaveCount(3);

    expect($outputs[0]->get())->toBe('match 1');

    expect($outputs[2]->get())->toBe('match 3');
});

it('also takes a DomQuery instance as argument for extract', function () {
    $step = helper_getDomStepInstance()::root()->extract(Dom::cssSelector('.list .item:first-child .match'));

    $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('match 2');
});

test('Extracting with single selector also works with each', function () {
    $step = helper_getDomStepInstance()::each('.list .item')->extract('.match');

    $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($outputs)->toHaveCount(2);

    expect($outputs[0]->get())->toBe('match 2');

    expect($outputs[1]->get())->toBe('match 3');
});

test('Extracting with single selector also works with first', function () {
    $step = helper_getDomStepInstance()::first('.list .item')->extract('.match');

    $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('match 2');
});

test('Extracting with single selector also works with last', function () {
    $step = helper_getDomStepInstance()::last('.list .item')->extract('.match');

    $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('match 3');
});

test('Extracting with single selector that doesn\'t match anything doesn\'t yield any output', function () {
    $step = helper_getDomStepInstance()::last('.list .item')->extract('.m\ätch');

    $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($outputs)->toHaveCount(0);
});

it('extracts one result from the root node when the root method is used', function () {
    $step = helper_getDomStepInstance()::root()->extract(['matches' => '.match']);

    $output = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    // With a mapping, all matches of a selector end up as an array in a single output.
    expect($output)->toHaveCount(1);

    expect($output[0]->get())->toBe(['matches' => ['match 1', 'match 2', 'match 3']]);
});

it('extracts each matching result when the each method is used', function () {
    $step = helper_getDomStepInstance()::each('.list .item')->extract(['match' => '.match']);

    $output = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($output)->toHaveCount(2);

    expect($output[0]->get())->toBe(['match' => 'match 2']);

    expect($output[1]->get())->toBe(['match' => 'match 3']);
});

// With an empty selector for each(), the step is expected to log a warning and to extract from the
// whole document instead (one output, containing all matches).
it('logs a warning, when the each() method is used with an empty selector', function (string|DomQuery $selector) {
    $logger = new DummyLogger();

    $step = helper_getDomStepInstance()::each($selector)->extract(['match' => '.match']);

    $step->addLogger($logger);

    $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe(['match' => ['match 1', 'match 2', 'match 3']])
        // Assert that something was logged before indexing into the messages, so a missing warning
        // fails as a clear assertion instead of an undefined array offset.
        ->and($logger->messages)->not->toBeEmpty()
        ->and($logger->messages[0]['level'])->toBe('warning')
        ->and($logger->messages[0]['message'])
        ->toStartWith('The selector you provided for the ‘each’ option is empty.');
})->with([
    // Empty selector as plain string, CSS selector and XPath query.
    [''],
    [Dom::cssSelector('')],
    [Dom::xPath('')],
]);

it('extracts the first matching result when the first method is used', function () {
    $step = helper_getDomStepInstance()::first('.list .item')->extract(['match' => '.match']);

    $output = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($output)->toHaveCount(1);

    expect($output[0]->get())->toBe(['match' => 'match 2']);
});

// With an empty selector for first(), the step is expected to log a warning and to extract from the
// whole document instead (one output, containing all matches).
it('logs a warning, when the first() method is used with an empty selector', function (string|DomQuery $selector) {
    $logger = new DummyLogger();

    $step = helper_getDomStepInstance()::first($selector)->extract(['match' => '.match']);

    $step->addLogger($logger);

    $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe(['match' => ['match 1', 'match 2', 'match 3']])
        // Assert that something was logged before indexing into the messages, so a missing warning
        // fails as a clear assertion instead of an undefined array offset.
        ->and($logger->messages)->not->toBeEmpty()
        ->and($logger->messages[0]['level'])->toBe('warning')
        ->and($logger->messages[0]['message'])
        ->toStartWith('The selector you provided for the ‘first’ option is empty.');
})->with([
    // Empty selector as plain string, CSS selector and XPath query.
    [''],
    [Dom::cssSelector('')],
    [Dom::xPath('')],
]);

it('extracts the last matching result when the last method is used', function () {
    $step = helper_getDomStepInstance()::last('.list .item')->extract(['match' => '.match']);

    $output = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($output)->toHaveCount(1);

    expect($output[0]->get())->toBe(['match' => 'match 3']);
});

// With an empty selector for last(), the step is expected to log a warning and to extract from the
// whole document instead (one output, containing all matches).
it('logs a warning, when the last() method is used with an empty selector', function (string|DomQuery $selector) {
    $logger = new DummyLogger();

    $step = helper_getDomStepInstance()::last($selector)->extract(['match' => '.match']);

    $step->addLogger($logger);

    $outputs = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe(['match' => ['match 1', 'match 2', 'match 3']])
        // Assert that something was logged before indexing into the messages, so a missing warning
        // fails as a clear assertion instead of an undefined array offset.
        ->and($logger->messages)->not->toBeEmpty()
        ->and($logger->messages[0]['level'])->toBe('warning')
        ->and($logger->messages[0]['message'])
        ->toStartWith('The selector you provided for the ‘last’ option is empty.');
})->with([
    // Empty selector as plain string, CSS selector and XPath query.
    [''],
    [Dom::cssSelector('')],
    [Dom::xPath('')],
]);

it('doesn\'t yield any output when the each selector doesn\'t match anything', function () {
    $step = helper_getDomStepInstance()::each('.list .ytem')->extract(['match' => '.match']);

    $output = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($output)->toHaveCount(0);
});

it('doesn\'t yield any output when the first selector doesn\'t match anything', function () {
    $step = helper_getDomStepInstance()::first('.list .ytem')->extract(['match' => '.match']);

    $output = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($output)->toHaveCount(0);
});

it('doesn\'t yield any output when the last selector doesn\'t match anything', function () {
    $step = helper_getDomStepInstance()::last('.list .otem')->extract(['match' => '.match']);

    $output = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($output)->toHaveCount(0);
});

it('returns an array with null values when selectors in an extract array mapping don\'t match anything', function () {
    $step = helper_getDomStepInstance()::last('.list .item')->extract([
        'match' => '.match',
        'noMatch' => '.doesntMatch',
    ]);

    $output = helper_invokeStepWithInput($step, helper_getStepFilesContent('Html/basic.html'));

    expect($output)->toHaveCount(1);

    // The key of the selector without a match is kept, with null as value.
    expect($output[0]->get())->toBe(['match' => 'match 3', 'noMatch' => null]);
});

test('The static cssSelector method returns an instance of CssSelector using the provided selector', function () {
    $selector = Dom::cssSelector('.item');

    expect($selector)->toBeInstanceOf(CssSelector::class);

    // Applying the selector to a document shows that the provided selector string is actually used.
    $document = new Dom\HtmlDocument('<span class="item">yes</span>');

    expect($selector->apply($document))->toBe('yes');
});

test('The static xPath method returns an instance of XPathQuery using the provided query', function () {
    $query = Dom::xPath('//item');

    expect($query)->toBeInstanceOf(XPathQuery::class);

    // Applying the query to a document shows that the provided query string is actually used.
    $document = new Dom\XmlDocument('<item>yes</item>');

    expect($query->apply($document))->toBe('yes');
});

it('uses the keys of the provided mapping as keys in the returned output', function () {
    // The last mapping element has no string key, so it gets the integer key 0.
    $step = helper_getDomStepInstance()::root()->extract(['foo' => '.foo', 'notBar' => '.bar', '.baz']);

    $html = '<p class="foo">foo content</p><p class="bar">bar content</p><p class="baz">baz content</p>';

    $output = helper_invokeStepWithInput($step, $html);

    expect($output)->toHaveCount(1);

    expect($output[0]->get())->toBe(['foo' => 'foo content', 'notBar' => 'bar content', 0 => 'baz content']);
});

it('trims the extracted data', function () {
    $step = helper_getDomStepInstance()::root()->extract(['foo' => '.foo']);

    $html = "<p class=\"foo\">  \n   foo content   \n   \n</p>";

    $output = helper_invokeStepWithInput($step, $html);

    expect($output)->toHaveCount(1);

    expect($output[0]->get())->toBe(['foo' => 'foo content']);
});

it('automatically passes on the base url to dom query instances when the input is a RespondedRequest', function () {
    $step = helper_getDomStepInstance()::root()->extract([
        'one' => Dom::cssSelector('#one')->attribute('href')->toAbsoluteUrl(),
        'two' => Dom::cssSelector('#two')->link(),
    ]);

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/home'),
        new Response(body: '<p><a id="one" href="/foo/bar">foo bar</a> <a id="two" href="yo/lo">yolo</a></p>'),
    );

    $output = helper_invokeStepWithInput($step, $respondedRequest);

    expect($output)->toHaveCount(1);

    // Both hrefs are expected to be resolved against the URL of the request.
    expect($output[0]->get())->toBe([
        'one' => 'https://www.example.com/foo/bar',
        'two' => 'https://www.example.com/yo/lo',
    ]);
});

it('removes the fragment part from URLs when the withoutFragment method is called on a DomQuery instance', function () {
    $html = <<<HTML
        <p>
            <a id="one" href="/foo#foo">one</a> <br>
            <a id="two" href="/bar#bar">two</a> <br>
            <a id="three" href="/baz#baz">three</a> <br>
            <a id="four" href="/quz#quz">four</a> <br>
        </p>
        HTML;

    // Links one and two keep their fragment, three and four use withoutFragment().
    $step = helper_getDomStepInstance()::root()->extract([
        'one' => Dom::cssSelector('#one')->link(),
        'two' => Dom::xPath('//a[@id=\'two\']')->link(),
        'three' => Dom::cssSelector('#three')->link()->withoutFragment(),
        'four' => Dom::xPath('//a[@id=\'four\']')->link()->withoutFragment(),
    ]);

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/home'),
        new Response(body: $html),
    );

    $output = helper_invokeStepWithInput($step, $respondedRequest);

    expect($output)->toHaveCount(1);

    expect($output[0]->get())->toBe([
        'one' => 'https://www.example.com/foo#foo',
        'two' => 'https://www.example.com/bar#bar',
        'three' => 'https://www.example.com/baz',
        'four' => 'https://www.example.com/quz',
    ]);
});


================================================
FILE: tests/Steps/Filters/ArrayFilterTest.php
================================================
<?php

namespace tests\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\Filter;

it('filters an array of string values', function (array $values, bool $evaluationResult) {
    $hasFooElement = Filter::arrayHasElement()->where(Filter::equal('foo'));

    expect($hasFooElement->evaluate($values))->toBe($evaluationResult);
})->with([
    // [values, expected evaluation result]
    [['foo', 'bar', 'baz'], true],
    [['bar', 'baz', 'quz'], false],
]);

it('filters a multi-level array by a key of the array elements (which are also arrays)', function () {
    $values = [
        ['foo' => 'one', 'bar' => 'two'],
        ['foo' => 'two', 'bar' => 'three'],
        ['foo' => 'three', 'bar' => 'four'],
    ];

    // 'four' only exists under the key 'bar', never under 'foo'.
    $fooIsFour = Filter::arrayHasElement()->where('foo', Filter::equal('four'));

    expect($fooIsFour->evaluate($values))->toBeFalse();

    $fooIsTwo = Filter::arrayHasElement()->where('foo', Filter::equal('two'));

    expect($fooIsTwo->evaluate($values))->toBeTrue();
});

it('applies multiple complex filters on a multi-level array', function () {
    $values = [
        [
            'id' => '123',
            'name' => 'abc',
            'tags' => [
                ['type' => 'companyId', 'value' => '123'],
                ['type' => 'type', 'value' => 'job-ad'],
                ['type' => 'companyId', 'value' => '125'],
            ],
        ],
        [
            'id' => '124',
            'name' => 'abd',
            'tags' => [
                ['type' => 'companyId', 'value' => '123'],
                ['type' => 'type', 'value' => 'blog-post'],
                ['type' => 'author', 'value' => 'John Doe'],
            ],
        ],
        [
            'id' => '125',
            'name' => 'abf',
            'tags' => [
                ['type' => 'companyId', 'value' => '123'],
                ['type' => 'companyId', 'value' => '124'],
                ['type' => 'type', 'value' => 'job-ad'],
                ['type' => 'companyId', 'value' => '125'],
            ],
        ],
    ];

    // Builds a filter for elements having a companyId tag with value 123 and a type tag with value
    // job-ad, but no companyId tag with the excluded value.
    $makeFilter = fn (string $excludedCompanyId) => Filter::arrayHasElement()
        ->where(
            'tags',
            Filter::arrayHasElement()
                ->where('type', Filter::equal('companyId'))
                ->where('value', Filter::equal('123')),
        )
        ->where(
            'tags',
            Filter::arrayHasElement()
                ->where('type', Filter::equal('companyId'))
                ->where('value', Filter::equal($excludedCompanyId))
                ->negate(),
        )
        ->where(
            'tags',
            Filter::arrayHasElement()
                ->where('type', Filter::equal('type'))
                ->where('value', Filter::equal('job-ad')),
        );

    // The first element is a job-ad of company 123, without a companyId 124 tag.
    expect($makeFilter('124')->evaluate($values))->toBeTrue();

    // Both job-ad elements also have a companyId 125 tag.
    expect($makeFilter('125')->evaluate($values))->toBeFalse();
});


================================================
FILE: tests/Steps/Filters/ClosureFilterTest.php
================================================
<?php

namespace tests\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\ClosureFilter;

use function tests\helper_getStdClassWithData;

it('evaluates with a scalar value', function () {
    $filter = new ClosureFilter(fn (mixed $value) => in_array($value, ['one', 'two', 'three'], true));

    expect($filter->evaluate('one'))->toBeTrue()
        ->and($filter->evaluate('four'))->toBeFalse();
});

it('evaluates with a value from an array by key', function () {
    $filter = new ClosureFilter(fn (mixed $value) => in_array($value, ['one', 'two', 'three'], true));

    // Only the value of the key 'bar' should be passed to the closure.
    $filter->useKey('bar');

    expect($filter->evaluate(['foo' => 'one', 'bar' => 'two']))->toBeTrue()
        ->and($filter->evaluate(['foo' => 'three', 'bar' => 'four']))->toBeFalse();
});

it('compares a value from an object by key', function () {
    $filter = new ClosureFilter(fn (mixed $value) => in_array($value, ['one', 'two', 'three'], true));

    // Only the value of the property 'bar' should be passed to the closure.
    $filter->useKey('bar');

    expect($filter->evaluate(helper_getStdClassWithData(['foo' => 'one', 'bar' => 'two'])))->toBeTrue()
        ->and($filter->evaluate(helper_getStdClassWithData(['foo' => 'three', 'bar' => 'four'])))->toBeFalse();
});


================================================
FILE: tests/Steps/Filters/ComparisonFilterTest.php
================================================
<?php

namespace tests\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\ComparisonFilter;
use Crwlr\Crawler\Steps\Filters\Enums\ComparisonFilterRule;

use function tests\helper_getStdClassWithData;

it('compares a single value', function () {
    $greaterThanThree = new ComparisonFilter(ComparisonFilterRule::GreaterThan, 3);

    expect($greaterThanThree->evaluate(4))->toBeTrue();

    expect($greaterThanThree->evaluate(2))->toBeFalse();
});

it('compares a value from an array by key', function () {
    $barIsNotBarValue = new ComparisonFilter(ComparisonFilterRule::NotEqual, 'barValue');

    // Only the value of the key 'bar' should be compared.
    $barIsNotBarValue->useKey('bar');

    expect($barIsNotBarValue->evaluate(['foo' => 'fooValue', 'bar' => 'barValue']))->toBeFalse();

    expect($barIsNotBarValue->evaluate(['foo' => 'fooValue', 'bar' => 'barzValue']))->toBeTrue();
});

it('compares a value from an object by key', function () {
    $barIsNotBarValue = new ComparisonFilter(ComparisonFilterRule::NotEqual, 'barValue');

    // Only the value of the property 'bar' should be compared.
    $barIsNotBarValue->useKey('bar');

    $matchingObject = helper_getStdClassWithData(['foo' => 'fooValue', 'bar' => 'barValue']);

    expect($barIsNotBarValue->evaluate($matchingObject))->toBeFalse();

    $differingObject = helper_getStdClassWithData(['foo' => 'fooValue', 'bar' => 'barzValue']);

    expect($barIsNotBarValue->evaluate($differingObject))->toBeTrue();
});


================================================
FILE: tests/Steps/Filters/Enums/ComparisonFilterRuleTest.php
================================================
<?php

namespace tests\Steps\Filters\Enums;

use Crwlr\Crawler\Steps\Filters\Enums\ComparisonFilterRule;

it('correctly applies equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(ComparisonFilterRule::Equal->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // [expected result, value 1, value 2]
    [true, 1, 1],
    [true, 'one', 'one'],
    [true, 1.12, 1.12],
    [false, 1, 2],
    // An integer and a numeric string with the same value are not expected to be equal.
    [false, 1, '1'],
    [false, 'one', 'two'],
    [false, 1.12, 1.122],
]);

it('correctly applies not equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(ComparisonFilterRule::NotEqual->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // [expected result, value 1, value 2]
    [false, 1, 1],
    [false, 'one', 'one'],
    [false, 1.12, 1.12],
    [true, 1, 2],
    // An integer and a numeric string with the same value are not expected to be equal.
    [true, 1, '1'],
    [true, 'one', 'two'],
    [true, 1.12, 1.122],
]);

it('correctly applies greater than operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(ComparisonFilterRule::GreaterThan->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // Integers and floats.
    [true, 1, 0],
    [true, 12, 3],
    [true, 1.12, 1.11],
    [false, 11, 11],
    [false, 0, 1],
    [false, 3.59, 3.591],
    // Integer strings, also mixed with integers.
    [true, '123', '122'],
    [true, '123', 122],
    [true, 123, '122'],
    [false, '123', '124'],
    [false, '123', 124],
    [false, 123, '124'],
    // Decimal strings, also mixed with floats.
    [true, '123.45', '123.44'],
    [true, '123.45', 123.44],
    [true, 123.45, '123.44'],
    [false, '123.45', '123.46'],
    [false, '123.45', 123.46],
    [false, 123.45, '123.46'],
]);

it('correctly applies greater than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(ComparisonFilterRule::GreaterThanOrEqual->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // Integers and floats.
    [true, 1, 0],
    [true, 12, 3],
    [true, 1.12, 1.11],
    [true, 11, 11],
    [false, 0, 1],
    [false, 3.59, 3.591],
    // Integer strings, also mixed with integers.
    [true, '123', '122'],
    [true, '123', 122],
    [true, 123, '123'],
    [false, '123', '124'],
    [false, '123', 124],
    [false, 123, '124'],
    // Decimal strings, also mixed with floats.
    [true, '123.45', '123.44'],
    [true, '123.44', 123.44],
    [true, 123.45, '123.44'],
    [false, '123.45', '123.46'],
    [false, '123.45', 123.46],
    [false, 123.45, '123.46'],
]);

it('correctly applies less than operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(ComparisonFilterRule::LessThan->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // Integers and floats.
    [true, 0, 1],
    [true, 4, 5],
    [true, 5.79, 5.7901],
    [false, 11, 11],
    [false, 1, 0],
    [false, 9.2901, 9.29],
    // Integer strings, also mixed with integers.
    [true, '123', '124'],
    [true, '123', 124],
    [true, 123, '124'],
    [false, '123', '122'],
    [false, '123', 122],
    [false, 123, '122'],
    // Decimal strings, also mixed with floats.
    [true, '123.45', '123.46'],
    [true, '123.45', 123.46],
    [true, 123.45, '123.46'],
    [false, '123.45', '123.44'],
    [false, '123.45', 123.44],
    [false, 123.45, '123.44'],
]);

it('correctly applies less than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(ComparisonFilterRule::LessThanOrEqual->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // Integers and floats.
    [true, 0, 1],
    [true, 4, 5],
    [true, 5.79, 5.7901],
    [true, 11, 11],
    [false, 1, 0],
    [false, 9.2901, 9.29],
    // Integer strings, also mixed with integers.
    [true, '123', '124'],
    [true, '123', 124],
    [true, 123, '123'],
    [false, '123', '122'],
    [false, '123', 122],
    [false, 123, '122'],
    // Decimal strings, also mixed with floats.
    [true, '123.45', '123.46'],
    [true, '123.45', 123.45],
    [true, 123.45, '123.46'],
    [false, '123.45', '123.44'],
    [false, '123.45', 123.44],
    [false, 123.45, '123.44'],
]);


================================================
FILE: tests/Steps/Filters/Enums/StringFilterRuleTest.php
================================================
<?php

namespace tests\Steps\Filters\Enums;

use Crwlr\Crawler\Steps\Filters\Enums\StringFilterRule;

it('checks if a string contains another string', function (
    bool $expectedResult,
    mixed $haystack,
    mixed $needle,
) {
    expect(StringFilterRule::Contains->evaluate($haystack, $needle))->toBe($expectedResult);
})->with([
    // [expected result, haystack, needle]
    [true, 'foobarbaz', 'foo'],
    [true, 'foo bar baz', 'foo'],
    [true, 'foo bar baz', 'bar'],
    [true, 'foo bar baz', 'baz'],
    // The check is expected to be case-sensitive.
    [false, 'foo bar baz', 'Foo'],
]);

it('checks if a string starts with another string', function (
    bool $expectedResult,
    mixed $haystack,
    mixed $needle,
) {
    expect(StringFilterRule::StartsWith->evaluate($haystack, $needle))->toBe($expectedResult);
})->with([
    // [expected result, haystack, needle]
    [true, 'foobarbaz', 'foo'],
    [true, 'foo bar baz', 'foo'],
    [true, 'foo bar baz', 'foo bar'],
    [false, 'foo bar baz', 'bar'],
    [false, 'foo bar baz', 'baz'],
    // The check is expected to be case-sensitive.
    [false, 'foo bar baz', 'Foo'],
]);

it('checks if a string ends with another string', function (
    bool $expectedResult,
    mixed $haystack,
    mixed $needle,
) {
    expect(StringFilterRule::EndsWith->evaluate($haystack, $needle))->toBe($expectedResult);
})->with([
    // [expected result, haystack, needle]
    [true, 'foobarbaz', 'baz'],
    [true, 'foo bar baz', 'baz'],
    [true, 'foo bar baz', 'bar baz'],
    [false, 'foo bar baz', 'bar'],
    [false, 'foo bar baz', 'foo'],
    // The check is expected to be case-sensitive.
    [false, 'foo bar baz', 'Baz'],
]);


================================================
FILE: tests/Steps/Filters/Enums/StringLengthFilterRuleTest.php
================================================
<?php

namespace tests\Steps\Filters\Enums;

use Crwlr\Crawler\Steps\Filters\Enums\StringLengthFilterRule;

it('correctly applies equal rule', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(StringLengthFilterRule::Equal->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // [expected result, string, length to compare with]
    [true, 'foo', 3],
    [true, 'lorem', 5],
    [true, 'foo bar', 7],
    [false, 'bar', 4],
    [false, 'baz quz', 6],
]);

it('correctly applies not equal rule', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(StringLengthFilterRule::NotEqual->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // [expected result, string, length to compare with]
    [true, 'foo', 2],
    [true, 'foo bar', 8],
    [false, 'foo', 3],
    [false, 'lorem ipsum', 11],
]);

it('correctly applies greater than rule', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(StringLengthFilterRule::GreaterThan->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // [expected result, string, length to compare with]
    [true, 'foo', 2],
    [true, 'foo bar', 6],
    [false, 'foo', 3],
    [false, 'foo bar', 7],
]);

it('correctly applies greater than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(StringLengthFilterRule::GreaterThanOrEqual->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // [expected result, string, length to compare with]
    [true, 'foo', 2],
    [true, 'foo', 3],
    [true, 'foo bar', 6],
    [true, 'foo bar', 7],
    [false, 'foo', 4],
    [false, 'foo bar', 8],
]);

it('correctly applies less than operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(StringLengthFilterRule::LessThan->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // [expected result, string, length to compare with]
    [true, 'foo', 4],
    [true, 'foo bar', 8],
    [false, 'foo', 3],
    [false, 'foo bar', 7],
]);

it('correctly applies less than or equal operator', function (bool $expectedResult, mixed $value1, mixed $value2) {
    expect(StringLengthFilterRule::LessThanOrEqual->evaluate($value1, $value2))->toBe($expectedResult);
})->with([
    // [expected result, string, length to compare with]
    [true, 'foo', 4],
    [true, 'foo', 3],
    [true, 'foo bar', 8],
    [true, 'foo bar', 7],
    [false, 'foo', 2],
    [false, 'foo bar', 6],
]);


================================================
FILE: tests/Steps/Filters/Enums/UrlFilterRuleTest.php
================================================
<?php

namespace tests\Steps\Filters\Enums;

use Crwlr\Crawler\Steps\Filters\Enums\UrlFilterRule;

// Dataset rows: [expected result, URL, scheme to look for].
it('checks if a URL has a certain scheme', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $matches = UrlFilterRule::Scheme->evaluate($haystack, $needle);

    expect($matches)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com', 'https'],
    [true, 'http://www.example.com', 'http'],
    [true, 'ftp://user:password@example.com:21/path', 'ftp'],
    [false, 'https://www.example.com', 'http'],
]);

// Dataset rows: [expected result, URL, host to look for].
it('checks if a URL has a certain host', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $matches = UrlFilterRule::Host->evaluate($haystack, $needle);

    expect($matches)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com', 'www.example.com'],
    [true, 'https://jobs.example.com', 'jobs.example.com'],
    [true, 'https://pew.pew.pew.example.com:8080/pew', 'pew.pew.pew.example.com'],
    [false, 'https://jobs.example.com', 'www.example.com'],
]);

// Dataset rows: [expected result, URL, domain to look for].
// The last row shows that a full host name ('www.example.com') does not count as the domain.
it('checks if a URL has a certain domain', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $matches = UrlFilterRule::Domain->evaluate($haystack, $needle);

    expect($matches)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com', 'example.com'],
    [true, 'https://jobs.example.com', 'example.com'],
    [true, 'https://pew.pew.pew.example.com:8080/pew', 'example.com'],
    [false, 'https://www.example.com', 'yolo.com'],
    [false, 'https://www.example.com', 'www.example.com'],
]);

// Dataset rows: [expected result, URL, path to look for].
// The path has to match as a whole, a mere prefix is not enough.
it('checks if a URL has a certain path', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $matches = UrlFilterRule::Path->evaluate($haystack, $needle);

    expect($matches)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com/foo/bar', '/foo/bar'],
    [false, 'https://www.example.com/foo/bar/baz', '/foo/bar'],
]);

// Dataset rows: [expected result, URL, expected beginning of the path].
it('checks if a URL path starts with a certain path', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $matches = UrlFilterRule::PathStartsWith->evaluate($haystack, $needle);

    expect($matches)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com/foo/bar', '/foo/bar'],
    [true, 'https://www.example.com/foo/bar', '/foo'],
    [false, 'https://www.example.com/foo/bar', '/bar'],
]);

// Dataset rows: [expected result, URL, regex pattern (given without delimiters) for the path].
it('checks if a URL path matches a regex pattern', function (bool $expectedResult, mixed $haystack, mixed $needle) {
    $matches = UrlFilterRule::PathMatches->evaluate($haystack, $needle);

    expect($matches)->toBe($expectedResult);
})->with([
    [true, 'https://www.example.com/foo/bar', '^/foo/'],
    [true, 'https://www.example.com/56/something/foo', '^/\d{1,5}/[a-z]{1,20}'],
    [false, 'https://www.example.com/56/some-thing/foo', '^/\d{1,5}/[a-z]{1,20}/'],
]);


================================================
FILE: tests/Steps/Filters/FilterTest.php
================================================
<?php

namespace tests\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\AbstractFilter;
use Exception;
use InvalidArgumentException;

use function tests\helper_getStdClassWithData;

/**
 * Minimal filter used to observe what the inherited getKey() method resolves.
 *
 * evaluate() always passes; the resolved value is stored in $value so the tests can inspect it.
 */
class TestFilter extends AbstractFilter
{
    /** Value most recently resolved via getKey(). */
    public string $value = '';

    /**
     * Resolves the value via getKey(), remembers it and always returns true.
     */
    public function evaluate(mixed $valueInQuestion): bool
    {
        $resolved = $this->getKey($valueInQuestion);

        $this->value = $resolved;

        return true;
    }
}

it('gets a key from an array', function () {
    $data = ['foo' => 'fooValue', 'bar' => 'barValue'];

    $testFilter = new TestFilter();

    $testFilter->useKey('foo');

    $testFilter->evaluate($data);

    // TestFilter stores whatever getKey() resolved.
    expect($testFilter->value)->toBe('fooValue');
});

it('gets a key from an object', function () {
    $object = helper_getStdClassWithData(['foo' => 'fooValue', 'bar' => 'barValue']);

    $testFilter = new TestFilter();

    $testFilter->useKey('foo');

    $testFilter->evaluate($object);

    // TestFilter stores whatever getKey() resolved.
    expect($testFilter->value)->toBe('fooValue');
});

it('throws an exception when the value in question is not array or object when a key to use was defined', function () {
    $testFilter = new TestFilter();

    $testFilter->useKey('foo');

    // A plain string has no keys, so resolving 'foo' must fail.
    $scalarValue = 'foo';

    $testFilter->evaluate($scalarValue);
})->throws(InvalidArgumentException::class);

it('throws an exception when the key to use is not contained in an array', function () {
    // The array deliberately has no 'foo' element.
    $arrayWithoutKey = ['bar' => 'barValue', 'baz' => 'bazValue'];

    $testFilter = new TestFilter();

    $testFilter->useKey('foo');

    $testFilter->evaluate($arrayWithoutKey);
})->throws(Exception::class);

it('throws an exception when the key to use is not contained in an object', function () {
    // The object deliberately has no 'foo' property.
    $objectWithoutKey = helper_getStdClassWithData(['bar' => 'barValue', 'baz' => 'bazValue']);

    $testFilter = new TestFilter();

    $testFilter->useKey('foo');

    $testFilter->evaluate($objectWithoutKey);
})->throws(Exception::class);


================================================
FILE: tests/Steps/Filters/NegatedFilterTest.php
================================================
<?php

namespace tests\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\Filter;
use Crwlr\Crawler\Steps\Filters\NegatedFilter;

it('wraps another filter and negates it', function () {
    $equalsFoo = Filter::equal('foo');

    $notEqualsFoo = new NegatedFilter($equalsFoo);

    // For every value the negated filter has to answer the opposite of the wrapped filter.
    expect($equalsFoo->evaluate('foo'))->toBeTrue()
        ->and($notEqualsFoo->evaluate('foo'))->toBeFalse()
        ->and($equalsFoo->evaluate('bar'))->toBeFalse()
        ->and($notEqualsFoo->evaluate('bar'))->toBeTrue();
});


================================================
FILE: tests/Steps/Filters/StringFilterTest.php
================================================
<?php

namespace tests\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\Enums\StringFilterRule;
use Crwlr\Crawler\Steps\Filters\StringFilter;

use function tests\helper_getStdClassWithData;

it('checks a string', function () {
    $containsBar = new StringFilter(StringFilterRule::Contains, 'bar');

    expect($containsBar->evaluate('foo bar baz'))->toBeTrue()
        ->and($containsBar->evaluate('lorem ipsum'))->toBeFalse();
});

it('checks a string from an array using a key', function () {
    $startsWithWaldo = new StringFilter(StringFilterRule::StartsWith, 'waldo');

    $startsWithWaldo->useKey('bar');

    // Only the 'bar' element is checked.
    $matching = ['foo' => 'something', 'bar' => 'waldo check', 'baz' => 'test'];

    $notMatching = ['foo' => 'something', 'bar' => 'check waldo', 'baz' => 'test'];

    expect($startsWithWaldo->evaluate($matching))->toBeTrue()
        ->and($startsWithWaldo->evaluate($notMatching))->toBeFalse();
});

it('checks a string from an object using a key', function () {
    $endsWithWaldo = new StringFilter(StringFilterRule::EndsWith, 'waldo');

    $endsWithWaldo->useKey('bar');

    // Only the 'bar' property is checked.
    $matching = helper_getStdClassWithData(['foo' => 'something', 'bar' => 'check waldo', 'baz' => 'test']);

    expect($endsWithWaldo->evaluate($matching))->toBeTrue();

    $notMatching = helper_getStdClassWithData(['foo' => 'something', 'bar' => 'waldo check', 'baz' => 'test']);

    expect($endsWithWaldo->evaluate($notMatching))->toBeFalse();
});


================================================
FILE: tests/Steps/Filters/StringLengthFilterTest.php
================================================
<?php

namespace tests\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\Enums\StringLengthFilterRule;
use Crwlr\Crawler\Steps\Filters\StringLengthFilter;

use function tests\helper_getStdClassWithData;

it('checks a string', function () {
    $longerThanTen = new StringLengthFilter(StringLengthFilterRule::GreaterThan, 10);

    // 'foo' has 3 characters, 'lorem ipsum' has 11.
    expect($longerThanTen->evaluate('foo'))->toBeFalse()
        ->and($longerThanTen->evaluate('lorem ipsum'))->toBeTrue();
});

it('checks a string from an array using a key', function () {
    $longerThanTen = new StringLengthFilter(StringLengthFilterRule::GreaterThan, 10);

    $longerThanTen->useKey('bar');

    // Only the length of the 'bar' element is checked.
    $tooShort = ['foo' => 'one', 'bar' => 'two', 'baz' => 'three'];

    $longEnough = ['foo' => 'one', 'bar' => 'lorem ipsum', 'baz' => 'three'];

    expect($longerThanTen->evaluate($tooShort))->toBeFalse()
        ->and($longerThanTen->evaluate($longEnough))->toBeTrue();
});

it('checks a string from an object using a key', function () {
    $longerThanTen = new StringLengthFilter(StringLengthFilterRule::GreaterThan, 10);

    $longerThanTen->useKey('bar');

    // Only the length of the 'bar' property is checked.
    $tooShort = helper_getStdClassWithData(['foo' => 'one', 'bar' => 'two', 'baz' => 'three']);

    expect($longerThanTen->evaluate($tooShort))->toBeFalse();

    $longEnough = helper_getStdClassWithData(['foo' => 'one', 'bar' => 'lorem ipsum', 'baz' => 'three']);

    expect($longerThanTen->evaluate($longEnough))->toBeTrue();
});


================================================
FILE: tests/Steps/Filters/UrlFilterTest.php
================================================
<?php

namespace tests\Steps\Filters;

use Crwlr\Crawler\Steps\Filters\Enums\UrlFilterRule;
use Crwlr\Crawler\Steps\Filters\UrlFilter;

use function tests\helper_getStdClassWithData;

it('evaluates an url', function () {
    $domainFilter = new UrlFilter(UrlFilterRule::Domain, 'crwlr.software');

    expect($domainFilter->evaluate('https://www.crwlr.software/packages'))->toBeTrue()
        ->and($domainFilter->evaluate('https://www.example.com/something'))->toBeFalse();
});

it('evaluates an url from an array using a key', function () {
    $httpsFilter = (new UrlFilter(UrlFilterRule::Scheme, 'https'))->useKey('bar');

    // Only the URL in the 'bar' element is checked.
    $httpsUrl = ['foo' => 'yo', 'bar' => 'https://www.example.com'];

    $httpUrl = ['foo' => 'yo', 'bar' => 'http://www.example.com'];

    expect($httpsFilter->evaluate($httpsUrl))->toBeTrue()
        ->and($httpsFilter->evaluate($httpUrl))->toBeFalse();
});

// Applies the PathStartsWith rule to the URL stored in the 'bar' property of an object.
// The description says "an url" (not "a string"), matching the sibling tests in this file,
// because the filter under test is the UrlFilter.
it('evaluates an url from an object using a key', function () {
    $urlFilter = (new UrlFilter(UrlFilterRule::PathStartsWith, '/foo'))->useKey('bar');

    // Path '/foo/bar/baz' starts with '/foo'.
    expect($urlFilter->evaluate(
        helper_getStdClassWithData(['foo' => 'yo', 'bar' => 'https://www.example.com/foo/bar/baz']),
    ))->toBeTrue();

    // Path '/articles/1' does not start with '/foo'.
    expect($urlFilter->evaluate(
        helper_getStdClassWithData(['foo' => 'yo', 'bar' => 'https://www.example.com/articles/1']),
    ))->toBeFalse();
});

it('doesnt throw an exception when value is not a valid url', function () {
    $hostFilter = new UrlFilter(UrlFilterRule::Host, 'invalid');

    // An invalid URL simply doesn't pass the filter, no exception is thrown.
    $result = $hostFilter->evaluate('https*://invalid');

    expect($result)->toBeFalse();
});


================================================
FILE: tests/Steps/GroupTest.php
================================================
<?php

namespace tests\Steps;

use Closure;
use Crwlr\Crawler\Crawler;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Output;
use Crwlr\Crawler\Steps\Group;
use Crwlr\Crawler\Steps\Refiners\StringRefiner;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Steps\StepInterface;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Generator;
use Mockery;

use function tests\helper_getInputReturningStep;
use function tests\helper_getLoadingStep;
use function tests\helper_getStdClassWithData;
use function tests\helper_getStepYieldingObjectWithNumber;
use function tests\helper_getValueReturningStep;
use function tests\helper_invokeStepWithInput;

/**
 * Adds all the given steps, in the given order, to the group and returns that same group.
 */
function helper_addStepsToGroup(Group $group, Step ...$steps): Group
{
    array_walk($steps, function (Step $stepToAdd) use ($group): void {
        $group->addStep($stepToAdd);
    });

    return $group;
}

/**
 * Registers the same callback via updateInputUsingOutput() on each of the given steps.
 */
function helper_addUpdateInputUsingOutputCallbackToSteps(Closure $callback, Step ...$steps): void
{
    array_walk($steps, function (Step $targetStep) use ($callback): void {
        $targetStep->updateInputUsingOutput($callback);
    });
}

/**
 * Creates a step that yields the string 'test' and flips its public $called flag to true when it is invoked.
 */
function helper_getStepThatRemembersIfItWasCalled(): Step
{
    $rememberingStep = new class extends Step {
        /** Becomes true as soon as invoke() has run. */
        public bool $called = false;

        protected function invoke(mixed $input): Generator
        {
            $this->called = true;

            yield 'test';
        }
    };

    return $rememberingStep;
}

test('You can add a step and it passes on the logger', function () {
    // The group already has a logger (but no loader) by the time the step gets added.
    $group = new Group();

    $group->addLogger(new CliLogger());

    $stepMock = Mockery::mock(StepInterface::class);

    $stepMock->shouldNotReceive('setLoader');

    $stepMock->shouldReceive('addLogger')->once();

    $group->addStep($stepMock);
});

it('also passes on a new logger to all steps when the logger is added after the steps', function () {
    $firstStep = Mockery::mock(StepInterface::class);

    $secondStep = Mockery::mock(StepInterface::class);

    $firstStep->shouldReceive('addLogger')->once();

    $secondStep->shouldReceive('addLogger')->once();

    $group = new Group();

    $group->addStep($firstStep);

    $group->addStep($secondStep);

    // The logger arrives only after both steps were added, still each step has to receive it once.
    $group->addLogger(new CliLogger());
});

// The group already holds a logger and a loader when the step is added,
// so adding the step has to hand over both of them exactly once.
it('also passes on the loader to the step when setLoader method exists in step', function () {
    // Mock built from the step returned by helper_getLoadingStep(), so the expectations below can be set on it.
    $step = Mockery::mock(helper_getLoadingStep());

    $step->shouldReceive('addLogger')->once();

    $step->shouldReceive('setLoader')->once();

    $group = new Group();

    $group->addLogger(new CliLogger());

    $group->setLoader(new HttpLoader(new BotUserAgent('MyBot')));

    // The annotation tells static analysis to treat the mock as a Step.
    /** @var Step $step */

    $group->addStep($step);
});

// Here the loader is set on the group only after both steps were added;
// each step still has to receive it exactly once.
it('also passes on a new loader to all steps when it is added after the steps', function () {
    $step1 = Mockery::mock(helper_getLoadingStep());

    $step1->shouldReceive('setLoader')->once();

    $step2 = Mockery::mock(helper_getLoadingStep());

    $step2->shouldReceive('setLoader')->once();

    $group = new Group();

    // The annotations tell static analysis to treat the mocks as Step instances.
    /** @var Step $step1 */

    $group->addStep($step1);

    /** @var Step $step2 */

    $group->addStep($step2);

    // Setting the loader now has to reach the steps that are already part of the group.
    $group->setLoader(new HttpLoader(new BotUserAgent('MyBot')));
});

test('The factory method returns a Group object instance', function () {
    $createdViaFactory = Crawler::group();

    expect($createdViaFactory)->toBeInstanceOf(Group::class);
});

test('You can add multiple steps and invokeStep calls all of them', function () {
    $rememberingSteps = [
        helper_getStepThatRemembersIfItWasCalled(),
        helper_getStepThatRemembersIfItWasCalled(),
        helper_getStepThatRemembersIfItWasCalled(),
    ];

    $group = new Group();

    foreach ($rememberingSteps as $rememberingStep) {
        $group->addStep($rememberingStep);
    }

    helper_invokeStepWithInput($group);

    // Every single child step must have been invoked.
    foreach ($rememberingSteps as $rememberingStep) {
        expect($rememberingStep->called)->toBeTrue(); // @phpstan-ignore-line
    }
});

it('combines the outputs of all it\'s steps into one output containing an array', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep('lorem'))
        ->addStep(helper_getValueReturningStep('ipsum'))
        ->addStep(helper_getValueReturningStep('dolor'));

    $outputs = helper_invokeStepWithInput($group, 'gogogo');

    expect($outputs)->toHaveCount(1);

    // The scalar outputs of the three steps end up in one list, in the order the steps were added.
    $combined = $outputs[0];

    expect($combined)->toBeInstanceOf(Output::class)
        ->and($combined->get())->toBe(['lorem', 'ipsum', 'dolor']);
});

test(
    'When defining keys for the steps via $step->outputKey(), the combined output array has those keys',
    function () {
        $group = (new Group())
            ->addStep(helper_getValueReturningStep('ich')->outputKey('foo'))
            ->addStep(helper_getValueReturningStep('bin')->outputKey('bar'))
            ->addStep(helper_getValueReturningStep('ein berliner')->outputKey('baz'));

        $outputs = helper_invokeStepWithInput($group, 'https://www.gogo.go');

        expect($outputs)->toHaveCount(1)
            ->and($outputs[0])->toBeInstanceOf(Output::class)
            // Each step's value sits under the key defined via outputKey().
            ->and($outputs[0]->get())->toBe(['foo' => 'ich', 'bar' => 'bin', 'baz' => 'ein berliner']);
    },
);

it('merges array outputs with string keys to one array', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']))
        ->addStep(helper_getValueReturningStep(['baz' => 'bazValue', 'yo' => 'lo']));

    $outputs = helper_invokeStepWithInput($group);

    // The elements of both array outputs end up in one flat array.
    $expectedMergedArray = [
        'foo' => 'fooValue',
        'bar' => 'barValue',
        'baz' => 'bazValue',
        'yo' => 'lo',
    ];

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe($expectedMergedArray);
});

it('doesn\'t invoke twice with duplicate inputs when uniqueInput was called', function () {
    $group = helper_addStepsToGroup(
        new Group(),
        helper_getValueReturningStep('one'),
        helper_getValueReturningStep('two'),
    );

    // Without uniqueInputs() the same input is processed every time.
    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1);

    $group->resetAfterRun();

    $group->uniqueInputs();

    // With uniqueInputs() the repeated input doesn't yield any output anymore.
    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(0);
});

it(
    'doesn\'t invoke twice with array inputs with duplicate keys when uniqueInput was called with that key',
    function () {
        $martyInput = ['foo' => 'bar', 'bttfc' => 'marty'];

        $docInput = ['foo' => 'bar', 'bttfc' => 'doc'];

        $group = helper_addStepsToGroup(
            new Group(),
            helper_getValueReturningStep('one'),
            helper_getValueReturningStep('two'),
        );

        $group->uniqueInputs();

        // Without a key, the two arrays are different inputs, so both are processed.
        expect(helper_invokeStepWithInput($group, $martyInput))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, $docInput))->toHaveCount(1);

        $group->resetAfterRun();

        $group->uniqueInputs('foo');

        // With key 'foo' both inputs count as equal, because their 'foo' values are the same.
        expect(helper_invokeStepWithInput($group, $martyInput))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, $docInput))->toHaveCount(0);
    },
);

it(
    'doesn\'t invoke twice with object inputs with duplicate keys when uniqueInput was called with that key',
    function () {
        $martyData = ['foo' => 'bar', 'bttfc' => 'marty'];

        $docData = ['foo' => 'bar', 'bttfc' => 'doc'];

        $group = helper_addStepsToGroup(
            new Group(),
            helper_getValueReturningStep('one'),
            helper_getValueReturningStep('two'),
        );

        $group->uniqueInputs();

        // Without a key, the two objects are different inputs, so both are processed.
        expect(helper_invokeStepWithInput($group, helper_getStdClassWithData($martyData)))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, helper_getStdClassWithData($docData)))->toHaveCount(1);

        $group->resetAfterRun();

        $group->uniqueInputs('foo');

        // With key 'foo' both inputs count as equal, because their 'foo' values are the same.
        expect(helper_invokeStepWithInput($group, helper_getStdClassWithData($martyData)))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, helper_getStdClassWithData($docData)))->toHaveCount(0);
    },
);

it('returns only unique outputs when uniqueOutput was called', function () {
    $group = helper_addStepsToGroup(
        new Group(),
        helper_getInputReturningStep(),
        helper_getValueReturningStep('test'),
    )->uniqueOutputs();

    // The first step returns its input, so the combined output differs per input:
    // 'foo' and 'bar' are new, the second 'foo' is a repetition.
    expect(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, 'bar'))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, 'foo'))->toHaveCount(0);
});

it('returns only unique outputs when outputs are arrays and uniqueOutput was called', function () {
    $group = helper_addStepsToGroup(
        new Group(),
        helper_getInputReturningStep(),
        helper_getValueReturningStep(['lorem' => 'ipsum']),
    )->uniqueOutputs();

    // The first step returns its (array) input, so repeating an input repeats the combined output.
    expect(helper_invokeStepWithInput($group, ['foo' => 'bar']))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, ['baz' => 'quz']))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group, ['foo' => 'bar']))->toHaveCount(0);
});

it(
    'returns only unique outputs when outputs are arrays and uniqueOutput was called with a key from the output arrays',
    function () {
        $group = helper_addStepsToGroup(
            new Group(),
            helper_getInputReturningStep(),
            helper_getValueReturningStep(['lorem' => 'ipsum']),
        )->uniqueOutputs('foo');

        // Only the 'foo' value decides about uniqueness: 'bar' and 'baz' are new,
        // the second 'bar' is a repetition, even though that input has an additional element.
        expect(helper_invokeStepWithInput($group, ['foo' => 'bar']))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, ['foo' => 'baz']))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group, ['foo' => 'bar', 'something' => 'else']))->toHaveCount(0);
    },
);

// Checks output uniqueness for a group whose child steps come from helper_getStepYieldingObjectWithNumber().
it('returns only unique outputs when outputs are objects and uniqueOutput was called', function () {
    $step1 = helper_getStepYieldingObjectWithNumber(10);

    $step2 = helper_getStepYieldingObjectWithNumber(11);

    $group = helper_addStepsToGroup(new Group(), $step1, $step2);

    // Before uniqueOutputs() is enabled, invoking the group yields one output.
    expect(helper_invokeStepWithInput($group))->toHaveCount(1);

    $group->uniqueOutputs();

    // With uniqueOutputs() the first invocation still yields one output, the identical second one yields none.
    expect(helper_invokeStepWithInput($group))->toHaveCount(1)
        ->and(helper_invokeStepWithInput($group))->toHaveCount(0);

    $incrementNumberCallback = function (mixed $input) {
        return $input + 1;
    };

    // Both steps now get a callback that returns the input incremented by one.
    helper_addUpdateInputUsingOutputCallbackToSteps($incrementNumberCallback, $step1, $step2);

    // After registering the callbacks, an invocation with input 1 is expected to yield one output again.
    expect(helper_invokeStepWithInput($group, new Input(1)))->toHaveCount(1);
});

// Same scenario as for uniqueOutputs() without argument, but uniqueness is based on the 'number' property,
// and the group is reset (resetAfterRun()) between the individual phases.
it(
    'returns only unique outputs when outputs are objects and uniqueOutput was called with a property name from the ' .
    'output objects',
    function () {
        $step1 = helper_getStepYieldingObjectWithNumber(21);

        $step2 = helper_getStepYieldingObjectWithNumber(23);

        $group = helper_addStepsToGroup(new Group(), $step1, $step2);

        // Before uniqueOutputs() is enabled, invoking the group yields one output.
        expect(helper_invokeStepWithInput($group))->toHaveCount(1);

        $group->resetAfterRun();

        $group->uniqueOutputs('number');

        // With uniqueOutputs('number') the first invocation yields one output, the identical second one none.
        expect(helper_invokeStepWithInput($group))->toHaveCount(1)
            ->and(helper_invokeStepWithInput($group))->toHaveCount(0);

        $group->resetAfterRun();

        $incrementNumberCallback = function (mixed $input) {
            return $input + 1;
        };

        // Both steps now get a callback that returns the input incremented by one.
        helper_addUpdateInputUsingOutputCallbackToSteps($incrementNumberCallback, $step1, $step2);

        // After the reset and with the callbacks registered, input 1 is expected to yield one output.
        expect(helper_invokeStepWithInput($group, new Input(1)))->toHaveCount(1);
    },
);

it(
    'excludes the output of a step from the combined group output, when the excludeFromGroupOutput() method was called',
    function () {
        $includedFirst = helper_getValueReturningStep(['foo' => 'one']);

        $excluded = helper_getValueReturningStep(['bar' => 'two'])->excludeFromGroupOutput();

        $includedLast = helper_getValueReturningStep(['baz' => 'three']);

        $group = helper_addStepsToGroup(new Group(), $includedFirst, $excluded, $includedLast);

        $outputs = helper_invokeStepWithInput($group);

        expect($outputs)->toHaveCount(1);

        // The 'bar' element of the excluded step must be missing.
        expect($outputs[0]->get())->toBe(['foo' => 'one', 'baz' => 'three']);
    },
);

test('You can update the input for further steps with the output of a step that is before those steps', function () {
    // The first step's output (under key 'foo') is appended to the input that the following step receives.
    $appendingStep = helper_getValueReturningStep(' rocks')
        ->updateInputUsingOutput(fn (mixed $input, mixed $output) => $input . $output['foo']);

    $inputEchoingStep = helper_getInputReturningStep();

    $group = (new Group())
        ->addStep($appendingStep->outputKey('foo'))
        ->addStep($inputEchoingStep->outputKey('bar'));

    $outputs = helper_invokeStepWithInput($group, 'crwlr.software');

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe(['foo' => ' rocks', 'bar' => 'crwlr.software rocks']);
});

it('uses a key from array input when defined', function () {
    $group = (new Group())
        ->addStep(helper_getInputReturningStep()->outputKey('test'))
        ->useInputKey('bar');

    $arrayInput = new Input(
        ['foo' => 'fooValue', 'bar' => 'barValue', 'baz' => 'bazValue'],
    );

    $outputs = helper_invokeStepWithInput($group, $arrayInput);

    // The input returning step only got the 'bar' element of the input array.
    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe(['test' => 'barValue']);
});

it('keeps the combined output with a certain key when keepAs() is used', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep('foo')->outputKey('key1'))
        ->addStep(helper_getValueReturningStep('bar')->outputKey('key2'))
        ->keepAs('test');

    $outputs = helper_invokeStepWithInput($group);

    // The whole combined output array is kept under the key passed to keepAs().
    $expectedKeptData = ['test' => ['key1' => 'foo', 'key2' => 'bar']];

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->keep)->toBe($expectedKeptData);
});

it('keeps all keys from a combined array output when keep() was called without argument', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']))
        ->addStep(helper_getValueReturningStep(['baz' => 'bazValue', 'yo' => 'lo']))
        ->keep();

    $outputs = helper_invokeStepWithInput($group);

    // All four elements of the combined output are kept.
    $expectedKeptData = [
        'foo' => 'fooValue',
        'bar' => 'barValue',
        'baz' => 'bazValue',
        'yo' => 'lo',
    ];

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->keep)->toBe($expectedKeptData);
});

it('keeps all defined keys from a combined array output when keep() was called with keys', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']))
        ->addStep(helper_getValueReturningStep(['baz' => 'bazValue', 'yo' => 'lo']))
        ->keep(['foo', 'baz', 'yo']);

    $outputs = helper_invokeStepWithInput($group);

    // 'bar' was not listed in the keep() call, so it is not kept.
    $expectedKeptData = [
        'foo' => 'fooValue',
        'baz' => 'bazValue',
        'yo' => 'lo',
    ];

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->keep)->toBe($expectedKeptData);
});

it('keeps data, when keep() is called on child steps', function () {
    $fooBarStep = helper_getValueReturningStep(['foo' => 'fooValue', 'bar' => 'barValue']);

    $bazQuzStep = helper_getValueReturningStep(['baz' => 'bazValue', 'quz' => 'quzValue']);

    // keep() is called on the child steps here, not on the group itself.
    $group = (new Group())
        ->addStep($fooBarStep->keep('foo'))
        ->addStep($bazQuzStep->keep(['baz', 'quz']));

    $outputs = helper_invokeStepWithInput($group);

    $expectedKeptData = [
        'foo' => 'fooValue',
        'baz' => 'bazValue',
        'quz' => 'quzValue',
    ];

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->keep)->toBe($expectedKeptData);
});

it('keeps data, when keepAs() is called on child steps', function () {
    $scalarStep = helper_getValueReturningStep('fooValue');

    $arrayStep = helper_getValueReturningStep(['bar' => 'barValue', 'baz' => 'bazValue']);

    // keepAs() is called on the child steps here, not on the group itself.
    $group = (new Group())
        ->addStep($scalarStep->keepAs('foo'))
        ->addStep($arrayStep->keepAs('quz'));

    $outputs = helper_invokeStepWithInput($group);

    // Each child step's complete output is kept under the key passed to its keepAs() call.
    $expectedKeptData = [
        'foo' => 'fooValue',
        'quz' => [
            'bar' => 'barValue',
            'baz' => 'bazValue',
        ],
    ];

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->keep)->toBe($expectedKeptData);
});

test(
    'when steps yield multiple outputs it combines the first output from first step with first output from second ' .
        'step and so on.',
    function () {
        // Yields two array outputs: 'one' first, 'two' second.
        $firstStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield ['one' => 'foo'];

                yield ['two' => 'bar'];
            }
        };

        // Yields two array outputs: 'three' first, 'four' second.
        $secondStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield ['three' => 'baz'];

                yield ['four' => 'quz'];
            }
        };

        $group = helper_addStepsToGroup(new Group(), $firstStep, $secondStep);

        $outputs = helper_invokeStepWithInput($group);

        expect($outputs)->toHaveCount(2);

        // Outputs are paired by position: first with first, second with second.
        expect($outputs[0]->get())->toBe(['one' => 'foo', 'three' => 'baz'])
            ->and($outputs[1]->get())->toBe(['two' => 'bar', 'four' => 'quz']);
    },
);

it('ignores the key set via outputKey because group step output is always an array', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['one' => 'foo']))
        ->addStep(helper_getValueReturningStep(['two' => 'bar']))
        ->outputKey('baz');

    $outputs = helper_invokeStepWithInput($group);

    expect($outputs)->toHaveCount(1);

    // The combined array is not wrapped in a 'baz' element.
    expect($outputs[0]->get())->toBe(['one' => 'foo', 'two' => 'bar']);
});

it(
    'keeps input data when keepFromInput() was called when outputs are combined',
    function () {
        $group = (new Group())
            ->addStep(helper_getValueReturningStep(['foo' => 'one']))
            ->addStep(helper_getValueReturningStep(['bar' => 'two']))
            ->keepFromInput();

        $outputs = helper_invokeStepWithInput($group, new Input(['baz' => 'three']));

        expect($outputs)->toHaveCount(1);

        $combined = $outputs[0];

        // The output value is just the combined step outputs, the input array lands in the kept data.
        expect($combined->get())->toBe(['foo' => 'one', 'bar' => 'two'])
            ->and($combined->keep)->toBe(['baz' => 'three']);
    },
);

it('keeps non array input data in array output with key', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'one']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->keepInputAs('baz');

    $outputs = helper_invokeStepWithInput($group, new Input('three'));

    expect($outputs)->toHaveCount(1);

    $combined = $outputs[0];

    // The scalar input is kept under the key passed to keepInputAs().
    expect($combined->get())->toBe(['foo' => 'one', 'bar' => 'two'])
        ->and($combined->keep)->toBe(['baz' => 'three']);
});

it('keeps a value with unnamed key, when non array input should be kept but no key is defined', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'one']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->keepFromInput();

    $outputs = helper_invokeStepWithInput($group, new Input('three'));

    expect($outputs)->toHaveCount(1);

    $combined = $outputs[0];

    // No key was defined for the scalar input, so it is kept as 'unnamed1'.
    expect($combined->get())->toBe(['foo' => 'one', 'bar' => 'two'])
        ->and($combined->keep)->toBe(['unnamed1' => 'three']);
});

it('contains an element with a numeric key when it contains a step that yields non array output', function () {
    $group = (new Group())
        ->addStep(helper_getValueReturningStep('one'))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']));

    $outputs = helper_invokeStepWithInput($group);

    expect($outputs)->toHaveCount(1);

    // The scalar output has no key of its own, so it gets the numeric key 0.
    expect($outputs[0]->get())->toBe([0 => 'one', 'bar' => 'two']);
});

it('keeps array input data when some output is non array but converted to array using outputKey()', function () {
    // The scalar output 'one' gets the key 'foo' via outputKey().
    $group = (new Group())
        ->addStep(helper_getValueReturningStep('one')->outputKey('foo'))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->keepFromInput();

    $outputs = helper_invokeStepWithInput($group, new Input(['baz' => 'three']));

    expect($outputs)->toHaveCount(1);

    $combined = $outputs[0];

    expect($combined->get())->toBe(['foo' => 'one', 'bar' => 'two'])
        ->and($combined->keep)->toBe(['baz' => 'three']);
});

it(
    'keeps an input value with an unnamed key, when it is a non array value and no key is defined (via keepInputAs())',
    function () {
        $group = (new Group())
            ->addStep(helper_getValueReturningStep('one'))
            ->addStep(helper_getValueReturningStep(['bar' => 'two']))
            ->keepFromInput();

        $outputs = helper_invokeStepWithInput($group, new Input('three'));

        expect($outputs)->toHaveCount(1);

        $combined = $outputs[0];

        // The scalar step output gets the numeric key 0, the scalar input is kept as 'unnamed1'.
        expect($combined->get())->toBe([0 => 'one', 'bar' => 'two'])
            ->and($combined->keep)->toBe(['unnamed1' => 'three']);
    },
);

// Even though only the "baz" key is selected as step input, the whole input array is expected in the kept data.
it('keeps the original input data when useInputKey() is used', function () {
    $originalInput = new Input(['baz' => 'three', 'quz' => 'four']);

    $stepGroup = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'one']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->useInputKey('baz')
        ->keepFromInput();

    $results = helper_invokeStepWithInput($stepGroup, $originalInput);

    expect($results)->toHaveCount(1);

    expect($results[0]->get())->toBe(['foo' => 'one', 'bar' => 'two']);

    expect($results[0]->keep)->toBe(['baz' => 'three', 'quz' => 'four']);
});

// A refiner registered without a key receives the whole combined output array of the group.
it('applies a Closure refiner to the steps output', function () {
    $refiner = function (mixed $combined) {
        $combined['bar'] .= ' refined';

        $combined['baz'] = 'three';

        return $combined;
    };

    $stepGroup = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'one']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->refineOutput($refiner);

    $results = helper_invokeStepWithInput($stepGroup);

    expect($results[0]->get())->toBe(['foo' => 'one', 'bar' => 'two refined', 'baz' => 'three']);
});

// A keyed refiner only touches the named output key ("foo"); "bar" is expected to stay as it is.
it('applies an instance of the RefinerInterface to the steps output', function () {
    $betweenRefiner = StringRefiner::betweenFirst('lorem', 'dolor');

    $stepGroup = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'lorem ipsum dolor']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->refineOutput('foo', $betweenRefiner);

    $results = helper_invokeStepWithInput($stepGroup);

    expect($results[0]->get())->toBe(['foo' => 'ipsum', 'bar' => 'two']);
});

// Two refiners on two different keys: each key is expected to be refined by its own refiner.
it('applies multiple refiners to the steps output in the order they\'re added', function () {
    $appendRefined = fn(mixed $value) => $value . ' refined';

    $stepGroup = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'lorem ipsum dolor']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'))
        ->refineOutput('bar', $appendRefined);

    $results = helper_invokeStepWithInput($stepGroup);

    expect($results[0]->get())->toBe(['foo' => 'ipsum', 'bar' => 'two refined']);
});

// Both refiners target "foo": the second one is expected to work on the result of the first.
test('you can apply multiple refiners to the same output array key', function () {
    $appendRefined = fn(mixed $value) => $value . ' refined';

    $stepGroup = (new Group())
        ->addStep(helper_getValueReturningStep(['foo' => 'lorem ipsum dolor']))
        ->addStep(helper_getValueReturningStep(['bar' => 'two']))
        ->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'))
        ->refineOutput('foo', $appendRefined);

    $results = helper_invokeStepWithInput($stepGroup);

    expect($results[0]->get())->toBe(['foo' => 'ipsum refined', 'bar' => 'two']);
});

it(
    'uses the original input value when applying a refiner, not only the value of an input array key chosen via ' .
    'useInputKey()',
    function () {
        $step1 = helper_getValueReturningStep(['foo' => 'one']);

        $step2 = helper_getValueReturningStep(['bar' => 'two']);

        $group = (new Group())
            ->addStep($step1)
            ->addStep($step2)
            // Actually select a single key of the input array. Without this call the test can't tell apart
            // "refiner receives the original input" from "refiner receives the value chosen via useInputKey()".
            ->useInputKey('yo')
            ->refineOutput(fn(mixed $outputValue, mixed $originalInputValue) => $originalInputValue);

        $outputs = helper_invokeStepWithInput($group, ['yo' => 'lo']);

        // The refiner replaces the output with whatever it got as original input: it must be the whole
        // input array, not just the string 'lo' stored under the selected key.
        expect($outputs[0]->get())->toBe(['yo' => 'lo']);
    },
);

// With maxOutputs(2), the third invocation is expected to yield nothing and to not run the child steps again.
it('stops calling its steps and producing outputs when maxOutputs is reached', function () {
    $fooStep = new class extends Step {
        public int $called = 0;

        protected function invoke(mixed $input): Generator
        {
            yield ['foo' => 'one'];

            // Only reached when the generator is resumed after its single yield.
            $this->called++;
        }
    };

    $barStep = new class extends Step {
        public int $called = 0;

        protected function invoke(mixed $input): Generator
        {
            yield ['bar' => 'two'];

            // Only reached when the generator is resumed after its single yield.
            $this->called++;
        }
    };

    $limitedGroup = (new Group())
        ->addStep($fooStep)
        ->addStep($barStep)
        ->maxOutputs(2);

    $firstRun = helper_invokeStepWithInput($limitedGroup, 'hey');

    expect($firstRun)->toHaveCount(1);

    $secondRun = helper_invokeStepWithInput($limitedGroup, 'ho');

    expect($secondRun)->toHaveCount(1);

    $thirdRun = helper_invokeStepWithInput($limitedGroup, 'hey');

    expect($thirdRun)->toHaveCount(0);

    expect($fooStep->called)->toBe(2);

    expect($barStep->called)->toBe(2);
});

it(
    'also stops creating outputs when maxOutputs is reached, when maxOutputs() was called before addStep()',
    function () {
        $fooStep = new class extends Step {
            public int $called = 0;

            protected function invoke(mixed $input): Generator
            {
                yield ['foo' => 'one'];

                // Only reached when the generator is resumed after its single yield.
                $this->called++;
            }
        };

        $barStep = new class extends Step {
            public int $called = 0;

            protected function invoke(mixed $input): Generator
            {
                yield ['bar' => 'two'];

                // Only reached when the generator is resumed after its single yield.
                $this->called++;
            }
        };

        // Same scenario as with maxOutputs() last, but here the limit is set before the steps are added.
        $limitedGroup = (new Group())
            ->maxOutputs(2)
            ->addStep($fooStep)
            ->addStep($barStep);

        $firstRun = helper_invokeStepWithInput($limitedGroup, 'hey');

        expect($firstRun)->toHaveCount(1);

        $secondRun = helper_invokeStepWithInput($limitedGroup, 'ho');

        expect($secondRun)->toHaveCount(1);

        $thirdRun = helper_invokeStepWithInput($limitedGroup, 'hey');

        expect($thirdRun)->toHaveCount(0);

        expect($fooStep->called)->toBe(2);

        expect($barStep->called)->toBe(2);
    },
);


================================================
FILE: tests/Steps/Html/CssSelectorTest.php
================================================
<?php

namespace tests\Steps\Html;

use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Html\CssSelector;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Html2Text\Html2Text;

use function tests\helper_getSimpleListHtml;

// Each dataset entry is a selector string the constructor is expected to reject.
it('throws an exception when created with an invalid CSS Selector', function ($invalidSelector) {
    new CssSelector($invalidSelector);
})
    ->with(['.foo;', '.foo:before'])
    ->throws(InvalidDomQueryException::class);

// One matching element: apply() is expected to return its text as a plain string.
test('The apply method returns a string for a single match', function () {
    $document = new HtmlDocument('<div class="item">test</div>');

    $result = (new CssSelector('.item'))->apply($document);

    expect($result)->toBe('test');
});

// Several matching elements: apply() is expected to return a list of texts, including text of child tags.
test('The apply method returns an array of strings for multiple matches', function () {
    $document = new HtmlDocument(
        '<div class="item">test</div><div class="item">test 2 <span>sub</span></div><div class="item">test 3</div>',
    );

    $result = (new CssSelector('.item'))->apply($document);

    expect($result)->toBe(['test', 'test 2 sub', 'test 3']);
});

// The selector ".aitem" does not match the only element (class "item"), so null is expected.
test('The apply method returns null if nothing matches', function () {
    $document = new HtmlDocument('<div class="item">test</div>');

    $result = (new CssSelector('.aitem'))->apply($document);

    expect($result)->toBeNull();
});

// The element's text is surrounded by line breaks and indentation that must not show up in the result.
it('trims whitespace', function () {
    $markup = <<<HTML
        <div class="item">
            test
        </div>
        HTML;

    $result = (new CssSelector('.item'))->apply(new HtmlDocument($markup));

    expect($result)->toBe('test');
});

// html() switches the selector to return the inner HTML instead of the text.
it('contains inner tags when the html method is called', function () {
    $document = new HtmlDocument('<div class="item">test <span>sub</span></div>');

    $result = (new CssSelector('.item'))->html()->apply($document);

    expect($result)->toBe('test <span>sub</span>');
});

// outerHtml() is expected to return the matched element including its own tag.
it('contains also the outer tag when the outerHtml method is called', function () {
    $markup = '<div class="item">test <span>sub</span></div>';

    $result = (new CssSelector('.item'))->outerHtml()->apply(new HtmlDocument($markup));

    expect($result)->toBe('<div class="item">test <span>sub</span></div>');
});

// formattedText() without arguments: headline, paragraph and list are expected as markdown-like plain text.
it('returns formatted text when formattedText() is called', function () {
    $document = new HtmlDocument(
        '<article id="a"><h1>headline</h1><p>paragraph</p><ul><li>item 1</li><li>item 2</li></ul></article>',
    );

    $expectedText = <<<TEXT
        # headline

        paragraph

        * item 1
        * item 2
        TEXT;

    $result = (new CssSelector('#a'))->formattedText()->apply($document);

    expect($result)->toBe($expectedText);
});

// With the "ul" converter removed from the custom Html2Text instance, list items are expected without bullets.
test('you can provide your own converter instance to get formattedText()', function () {
    $customConverter = new Html2Text();

    $customConverter->removeConverter('ul');

    $document = new HtmlDocument(
        '<article id="a"><h1>headline</h1><p>paragraph</p><ul><li>item 1</li><li>item 2</li></ul></article>',
    );

    $expectedText = <<<TEXT
        # headline

        paragraph

        item 1
        item 2
        TEXT;

    $result = (new CssSelector('#a'))->formattedText($customConverter)->apply($document);

    expect($result)->toBe($expectedText);
});

// attribute() makes the selector return the value of the given attribute instead of the element text.
it('gets the contents of an attribute using the attribute method', function () {
    $document = new HtmlDocument('<div class="item" data-attr="content">test</div>');

    $result = (new CssSelector('.item'))->attribute('data-attr')->apply($document);

    expect($result)->toBe('content');
});

// The element has no "foo" attribute: an empty string (not null) is expected.
test('getting an attribute value returns an empty string when the attribute does not exist', function () {
    $document = new HtmlDocument('<div class="item">test</div>');

    $result = (new CssSelector('.item'))->attribute('foo')->apply($document);

    expect($result)->toBe('');
});

// The same selector instance is applied twice: before and after toAbsoluteUrl() was called on it.
it('turns the value into an absolute url when toAbsoluteUrl() is called', function () {
    $dom = new HtmlDocument('<a href="/packages/crawler/v0.4/getting-started">getting started</a>');

    $linkSelector = new CssSelector('a');

    $linkSelector
        ->setBaseUrl('https://www.crwlr.software/')
        ->attribute('href');

    // Without toAbsoluteUrl() the raw attribute value is returned.
    $relative = $linkSelector->apply($dom);

    expect($relative)->toBe('/packages/crawler/v0.4/getting-started');

    $linkSelector->toAbsoluteUrl();

    $absolute = $linkSelector->apply($dom);

    expect($absolute)->toBe('https://www.crwlr.software/packages/crawler/v0.4/getting-started');
});

it(
    'turns the value into the correct absolute url when toAbsoluteUrl() is called and the HTML contains a base tag',
    function () {
        $markup = <<<HTML
            <!DOCTYPE html>
            <html>
            <head>
            <base href="/c/d" />
            </head>
            <body><a href="e">link</a></body>
            </html>
            HTML;

        $dom = new HtmlDocument($markup);

        $linkSelector = new CssSelector('a');

        $linkSelector
            ->setBaseUrl('https://www.example.com/a/b')
            ->attribute('href');

        // Without toAbsoluteUrl() the raw attribute value is returned.
        $relative = $linkSelector->apply($dom);

        expect($relative)->toBe('e');

        $linkSelector->toAbsoluteUrl();

        // The expected URL is resolved against the <base href="/c/d"> of the document, not against /a/b.
        $absolute = $linkSelector->apply($dom);

        expect($absolute)->toBe('https://www.example.com/c/e');
    },
);

// Before link() is called the selector yields the element text, afterwards the absolute href value.
it('gets an absolute link from the href attribute of a link element, when the link() method is called', function () {
    $dom = new HtmlDocument('<div id="foo"><a class="bar" href="/foo/bar">Foo</a></div>');

    $linkSelector = new CssSelector('#foo .bar');

    $linkSelector->setBaseUrl('https://www.example.com/');

    $textResult = $linkSelector->apply($dom);

    expect($textResult)->toBe('Foo');

    $linkSelector->link();

    $linkResult = $linkSelector->apply($dom);

    expect($linkResult)->toBe('https://www.example.com/foo/bar');
});

// first() is expected to reduce the matches in the simple list fixture to its first item.
it('gets only the first matching element when the first() method is called', function () {
    $listDocument = new HtmlDocument(helper_getSimpleListHtml());

    $result = (new CssSelector('#list .item'))->first()->apply($listDocument);

    expect($result)->toBe('one');
});

// last() is expected to reduce the matches in the simple list fixture to its last item.
it('gets only the last matching element when the last() method is called', function () {
    $listDocument = new HtmlDocument(helper_getSimpleListHtml());

    $result = (new CssSelector('#list .item'))->last()->apply($listDocument);

    expect($result)->toBe('four');
});

// nth(3) is expected to select the third match ("three") of the simple list fixture.
it('gets only the nth matching element when the nth() method is called', function () {
    $listDocument = new HtmlDocument(helper_getSimpleListHtml());

    $result = (new CssSelector('#list .item'))->nth(3)->apply($listDocument);

    expect($result)->toBe('three');
});

// Asking for a fifth match of the simple list fixture is expected to give null.
it('returns null when no nth matching element exists', function () {
    $listDocument = new HtmlDocument(helper_getSimpleListHtml());

    $result = (new CssSelector('#list .item'))->nth(5)->apply($listDocument);

    expect($result)->toBeNull();
});

// even() is expected to keep the second and fourth match of the simple list fixture.
it('gets only even matching elements when the even() method is called', function () {
    $listDocument = new HtmlDocument(helper_getSimpleListHtml());

    $result = (new CssSelector('#list .item'))->even()->apply($listDocument);

    expect($result)->toBe(['two', 'four']);
});

// odd() is expected to keep the first and third match of the simple list fixture.
it('gets only odd matching elements when the odd() method is called', function () {
    $listDocument = new HtmlDocument(helper_getSimpleListHtml());

    $result = (new CssSelector('#list .item'))->odd()->apply($listDocument);

    expect($result)->toBe(['one', 'three']);
});


================================================
FILE: tests/Steps/Html/Exceptions/InvalidDomQueryExceptionTest.php
================================================
<?php

namespace tests\Steps\Html\Exceptions;

use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Symfony\Component\CssSelector\Exception\ExpressionErrorException;
use Symfony\Component\CssSelector\Exception\SyntaxErrorException;

// The created exception is expected to carry both the query and the message of the symfony exception.
it('can be created from a symfony ExpressionErrorException', function () {
    $symfonyException = new ExpressionErrorException('error');

    $exception = InvalidDomQueryException::fromSymfonyException('.foo:before', $symfonyException);

    expect($exception->getDomQuery())->toBe('.foo:before');

    expect($exception->getMessage())->toBe('error');
});

// The created exception is expected to carry both the query and the message of the symfony exception.
it('can be created from a symfony SyntaxErrorException', function () {
    $symfonyException = new SyntaxErrorException('error message');

    $exception = InvalidDomQueryException::fromSymfonyException('.foo;', $symfonyException);

    expect($exception->getDomQuery())->toBe('.foo;');

    expect($exception->getMessage())->toBe('error message');
});

// make() takes the message first and the DOM query second.
it('can be created from a message and a query', function () {
    $exception = InvalidDomQueryException::make('message', '.foo > .bar;');

    expect($exception->getDomQuery())->toBe('.foo > .bar;');

    expect($exception->getMessage())->toBe('message');
});


================================================
FILE: tests/Steps/Html/GetLinkTest.php
================================================
<?php

namespace tests\Steps\Html;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Steps\Html\GetLink;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use tests\_Stubs\DummyLogger;

use function tests\helper_invokeStepWithInput;
use function tests\helper_traverseIterable;

// The relative href is expected to be resolved against the URL of the request.
it('works with a RespondedRequest as input', function () {
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/foo/bar'),
        new Response(200, [], '<a href="/blog">link</a>'),
    );

    $outputs = helper_invokeStepWithInput(new GetLink(), $respondedRequest);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('https://www.crwl.io/blog');
});

// A bare PSR-7 Response is not a RespondedRequest, so the step is expected to log instead of yielding output.
it('logs an error message when fed with invalid input', function () {
    $dummyLogger = new DummyLogger();

    $getLink = (new GetLink())->addLogger($dummyLogger);

    $invalidInput = new Input(new Response());

    helper_traverseIterable($getLink->invokeStep($invalidInput));

    expect($dummyLogger->messages)->not->toBeEmpty();

    expect($dummyLogger->messages[0]['message'])->toBe(
        'The Crwlr\Crawler\Steps\Html\GetLink step was called with input that it can not work with: Input must ' .
        'be an instance of RespondedRequest.',
    );
});

test('When called without selector it just returns the first link', function () {
    $step = (new GetLink());

    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.crwlr.software/packages/url/'),
        new Response(
            200,
            [],
            '<div><a href="v0.1">v0.1</a><a href="v1.0">v1.0</a><a href="v1.1">v1.1</a></div>',
        ),
    ));

    // The document contains three links. Asserting the count makes sure the step really yields *only* the
    // first one; checking $link[0] alone would also pass if all three links were returned.
    expect($link)->toHaveCount(1)
        ->and($link[0]->get())->toBe('https://www.crwlr.software/packages/url/v0.1');
});

test('When passing a CSS selector it selects the first matching link', function () {
    $step = (new GetLink('.matchingLink'));

    $responseHtml = <<<HTML
        <div>
            <a class="matchingLink" href="jobs">Jobs</a>
            <a class="matchingLink" href="numbers">Numbers</a>
            <a class="nonMatchingLink" href="/products">Products</a>
        </div>
        HTML;

    $link = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.foo.bar/company/about'),
        new Response(200, [], $responseHtml),
    ));

    // Two links match the selector. Asserting the count makes sure only the first matching link is yielded;
    // checking $link[0] alone would also pass if both matching links were returned.
    expect($link)->toHaveCount(1)
        ->and($link[0]->get())->toBe('https://www.foo.bar/company/jobs');
});

// Both the <span> and the <a> carry the class "link"; only the anchor is expected to produce an output.
test('When selector matches on a non-link element it\'s ignored', function () {
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], '<span class="link">not a link</span><a class="link" href="foo">link</a>'),
    );

    $outputs = helper_invokeStepWithInput(new GetLink('.link'), $respondedRequest);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('https://www.otsch.codes/foo');
});

// The request goes to www.otsch.codes; blog.otsch.codes is a different host but the same domain.
it('finds only links on the same domain when onSameDomain() was called', function () {
    $body = <<<HTML
        <a href="https://www.crwlr.software/packages">link1</a>
        <a href="https://blog.otsch.codes/articles">link2</a>
        HTML;

    $getLink = (new GetLink())->onSameDomain();

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput($getLink, $respondedRequest);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('https://blog.otsch.codes/articles');
});

// The first link is on the domain of the request URL and is expected to be skipped.
it('doesn\'t find a link on the same domain when notOnSameDomain() was called', function () {
    $body = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        HTML;

    $getLink = (new GetLink())->notOnSameDomain();

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput($getLink, $respondedRequest);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('https://www.crwlr.software/packages');
});

// Only the last of the four links is on example.com, the single domain passed to onDomain().
it('finds only links from domains the onDomain() method was called with', function () {
    $body = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        <a href="https://www.crwl.io">link3</a>
        <a href="https://www.example.com">link4</a>
        HTML;

    $getLink = (new GetLink())->onDomain('example.com');

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput($getLink, $respondedRequest);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('https://www.example.com');
});

// The same step is run against two documents, each containing a link for a different one of the two domains.
test('onDomain() also takes an array of domains', function () {
    $respond = static fn(string $body): RespondedRequest => new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $getLink = (new GetLink())->onDomain(['otsch.codes', 'example.com']);

    $firstBody = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        HTML;

    $firstOutputs = helper_invokeStepWithInput($getLink, $respond($firstBody));

    expect($firstOutputs)->toHaveCount(1);

    expect($firstOutputs[0]->get())->toBe('https://www.otsch.codes/contact');

    $secondBody = <<<HTML
        <a href="https://www.crwlr.software/packages">link1</a>
        <a href="https://www.example.com/foo">link2</a>
        HTML;

    $secondOutputs = helper_invokeStepWithInput($getLink, $respond($secondBody));

    expect($secondOutputs)->toHaveCount(1);

    expect($secondOutputs[0]->get())->toBe('https://www.example.com/foo');
});

// First only crwl.io is allowed, then otsch.codes and crwlr.software are added on the same step instance.
test('onDomain() can be called multiple times and merges all domains it was called with', function () {
    $respond = static fn(string $body): RespondedRequest => new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $getLink = (new GetLink())->onDomain('crwl.io');

    $onlyOtschLink = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        HTML;

    // otsch.codes is not allowed yet, so nothing is expected.
    $outputsBeforeMerge = helper_invokeStepWithInput($getLink, $respond($onlyOtschLink));

    expect($outputsBeforeMerge)->toHaveCount(0);

    $getLink->onDomain(['otsch.codes', 'crwlr.software']);

    $crwlFirst = <<<HTML
        <a href="https://www.crwl.io">link1</a>
        <a href="https://www.otsch.codes/contact">link2</a>
        HTML;

    // The domain from the first onDomain() call is expected to still be allowed.
    $outputsCrwlFirst = helper_invokeStepWithInput($getLink, $respond($crwlFirst));

    expect($outputsCrwlFirst)->toHaveCount(1);

    expect($outputsCrwlFirst[0]->get())->toBe('https://www.crwl.io');

    $otschFirst = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwl.io">link2</a>
        HTML;

    $outputsOtschFirst = helper_invokeStepWithInput($getLink, $respond($otschFirst));

    expect($outputsOtschFirst)->toHaveCount(1);

    expect($outputsOtschFirst[0]->get())->toBe('https://www.otsch.codes/contact');
});

// jobs.otsch.codes shares the domain but not the host of the request URL, so only link3 is expected.
it('finds only links on the same host when onSameHost() was called', function () {
    $body = <<<HTML
        <a href="https://www.crwlr.software/packages">link1</a>
        <a href="https://jobs.otsch.codes">link2</a>
        <a href="https://www.otsch.codes/contact">link3</a>
        HTML;

    $getLink = (new GetLink())->onSameHost();

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput($getLink, $respondedRequest);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('https://www.otsch.codes/contact');
});

// The first link is on the host of the request URL and is expected to be skipped.
it('doesn\'t find a link on the same host when notOnSameHost() was called', function () {
    $body = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://jobs.otsch.codes">link2</a>
        HTML;

    $getLink = (new GetLink())->notOnSameHost();

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput($getLink, $respondedRequest);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('https://jobs.otsch.codes');
});

// Only the last of the four links is on www.example.com, the single host passed to onHost().
it('finds only links from hosts the onHost() method was called with', function () {
    $body = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        <a href="https://www.crwl.io">link3</a>
        <a href="https://www.example.com">link4</a>
        HTML;

    $getLink = (new GetLink())->onHost('www.example.com');

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput($getLink, $respondedRequest);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('https://www.example.com');
});

// The same step is run against two documents, each containing a link for a different one of the two hosts.
test('onHost() also takes an array of hosts', function () {
    $respond = static fn(string $body): RespondedRequest => new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $getLink = (new GetLink())->onHost(['www.otsch.codes', 'blog.example.com']);

    $firstBody = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        HTML;

    $firstOutputs = helper_invokeStepWithInput($getLink, $respond($firstBody));

    expect($firstOutputs)->toHaveCount(1);

    expect($firstOutputs[0]->get())->toBe('https://www.otsch.codes/contact');

    // www.example.com is on an allowed host's domain but is not itself an allowed host.
    $secondBody = <<<HTML
        <a href="https://www.example.com/foo">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        <a href="https://blog.example.com/articles/1">link3</a>
        HTML;

    $secondOutputs = helper_invokeStepWithInput($getLink, $respond($secondBody));

    expect($secondOutputs)->toHaveCount(1);

    expect($secondOutputs[0]->get())->toBe('https://blog.example.com/articles/1');
});

// First only www.crwl.io is allowed, then two more hosts are added on the same step instance.
test('onHost() can be called multiple times and merges all hosts it was called with', function () {
    $respond = static fn(string $body): RespondedRequest => new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $getLink = (new GetLink())->onHost('www.crwl.io');

    $onlyOtschLink = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        HTML;

    // www.otsch.codes is not allowed yet, so nothing is expected.
    $outputsBeforeMerge = helper_invokeStepWithInput($getLink, $respond($onlyOtschLink));

    expect($outputsBeforeMerge)->toHaveCount(0);

    $getLink->onHost(['www.otsch.codes', 'www.crwlr.software']);

    $onlyCrwlLink = <<<HTML
        <a href="https://www.crwl.io">link1</a>
        HTML;

    // The host from the first onHost() call is expected to still be allowed.
    $outputsCrwl = helper_invokeStepWithInput($getLink, $respond($onlyCrwlLink));

    expect($outputsCrwl)->toHaveCount(1);

    expect($outputsCrwl[0]->get())->toBe('https://www.crwl.io');

    $otschFirst = <<<HTML
        <a href="https://www.otsch.codes/blog">link1</a>
        <a href="https://www.crwl.io">link2</a>
        HTML;

    $outputsOtschFirst = helper_invokeStepWithInput($getLink, $respond($otschFirst));

    expect($outputsOtschFirst)->toHaveCount(1);

    expect($outputsOtschFirst[0]->get())->toBe('https://www.otsch.codes/blog');
});

// The expected URL is resolved against the <base href="/c/d"> of the document, not against the request path.
it('works correctly when HTML contains a base tag', function () {
    $body = <<<HTML
        <!DOCTYPE html>
        <html>
        <head>
        <base href="/c/d" />
        </head>
        <body><a href="e">link</a></body>
        </html>
        HTML;

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/a/b'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput(new GetLink(), $respondedRequest);

    expect($outputs[0]->get())->toBe('https://www.example.com/c/e');
});

// The same step and input are used twice: before and after withoutFragment() was called on the step.
it('throws away the URL fragment part when withoutFragment() was called', function () {
    $body = <<<HTML
        <!DOCTYPE html>
        <html>
        <head></head>
        <body><a href="/foo/bar#fragment">link</a></body>
        </html>
        HTML;

    $input = new RespondedRequest(
        new Request('GET', 'https://www.example.com/foo/baz'),
        new Response(200, [], $body),
    );

    $getLink = new GetLink();

    // By default the fragment is part of the output.
    $withFragment = helper_invokeStepWithInput($getLink, $input);

    expect($withFragment[0]->get())->toBe('https://www.example.com/foo/bar#fragment');

    $getLink->withoutFragment();

    $withoutFragment = helper_invokeStepWithInput($getLink, $input);

    expect($withoutFragment[0]->get())->toBe('https://www.example.com/foo/bar');
});

// mailto:, javascript: and tel: links precede the only HTTP link, which is expected as the first output.
it('ignores special non HTTP links', function () {
    $body = <<<HTML
        <!DOCTYPE html>
        <html>
        <head></head>
        <body>
        <a href="mailto:somebody@example.com">mailto link</a>
        <a href="javascript:alert('hello');">javascript link</a>
        <a href="tel:+499123456789">phone link</a>
        <a href="/foo/bar">link</a>
        </body>
        </html>
        HTML;

    $input = new RespondedRequest(
        new Request('GET', 'https://www.example.com/home'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput(new GetLink(), $input);

    expect($outputs[0]->get())->toBe('https://www.example.com/foo/bar');
});


================================================
FILE: tests/Steps/Html/GetLinksTest.php
================================================
<?php

namespace tests\Steps\Html;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Steps\Html\GetLinks;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use stdClass;
use tests\_Stubs\DummyLogger;

use function tests\helper_invokeStepWithInput;
use function tests\helper_traverseIterable;

// A relative href in the response body is resolved against the URL of the request.
it('works with a RespondedRequest as input', function () {
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/home'),
        new Response(200, [], '<a href="/blog">link</a>'),
    );

    $outputs = helper_invokeStepWithInput(new GetLinks(), $respondedRequest);

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe('https://www.example.com/blog');
});

// Feeding the step something that is not a RespondedRequest must not blow up, but leave a message in the log.
it('logs an error message when fed with invalid input', function () {
    $expectedMessage =
        'The Crwlr\Crawler\Steps\Html\GetLinks step was called with input that it can not work with: Input must ' .
        'be an instance of RespondedRequest.';

    $dummyLogger = new DummyLogger();

    $step = (new GetLinks())->addLogger($dummyLogger);

    $invalidInput = new Input(new stdClass());

    helper_traverseIterable($step->invokeStep($invalidInput));

    expect($dummyLogger->messages)->not->toBeEmpty()
        ->and($dummyLogger->messages[0]['message'])->toBe($expectedMessage);
});

// Without a selector every anchor in the document is returned. The relative hrefs are resolved against the
// request URL, which ends with a slash, so they end up below /packages/url/.
test('When called without selector it just gets all links', function () {
    $step = (new GetLinks());

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.crwlr.software/packages/url/'),
        new Response(
            200,
            [],
            '<div><a href="v0.1">v0.1</a><a href="v1.0">v1.0</a><a href="v1.1">v1.1</a></div>',
        ),
    ));

    // Pin the number of outputs too, like the sibling tests do, so extra outputs are detected.
    expect($links)->toHaveCount(3)
        ->and($links[0]->get())->toBe('https://www.crwlr.software/packages/url/v0.1')
        ->and($links[1]->get())->toBe('https://www.crwlr.software/packages/url/v1.0')
        ->and($links[2]->get())->toBe('https://www.crwlr.software/packages/url/v1.1');
});

test('When passing a CSS selector it only selects matching links', function () {
    // Three of the four links carry the class the step is filtering by.
    $html = <<<HTML
        <div>
            <a class="matchingLink" href="jobs">Jobs</a>
            <a class="matchingLink" href="numbers">Numbers</a>
            <a class="notMatchingLink" href="/products">Products</a>
            <a class="matchingLink" href="/team">Team</a>
        </div>
        HTML;

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/company/about'),
        new Response(200, [], $html),
    );

    $links = helper_invokeStepWithInput(new GetLinks('.matchingLink'), $respondedRequest);

    expect($links)->toHaveCount(3)
        ->and($links[0]->get())->toBe('https://www.example.com/company/jobs')
        ->and($links[1]->get())->toBe('https://www.example.com/company/numbers')
        ->and($links[2]->get())->toBe('https://www.example.com/team');
});

test('When selector matches on a non-link element it\'s ignored', function () {
    // Both elements have the .link class, but only the first one is an anchor.
    $html = '<a class="link" href="foo">Foo</a><span class="link">Bar</span>';

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $html),
    );

    $outputs = helper_invokeStepWithInput(new GetLinks('.link'), $respondedRequest);

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe('https://www.otsch.codes/foo');
});

// The request goes to www.otsch.codes; links to any host below otsch.codes are expected, others are not.
it('finds only links on the same domain when onSameDomain() was called', function () {
    $body = <<<HTML
        <a href="https://www.crwlr.software/packages">link1</a>
        <a href="https://blog.otsch.codes/articles">link2</a>
        <a href="https://www.otsch.codes/blog">link3</a>
        HTML;

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $links = helper_invokeStepWithInput((new GetLinks())->onSameDomain(), $respondedRequest);

    expect($links)->toHaveCount(2)
        ->and($links[0]->get())->toBe('https://blog.otsch.codes/articles')
        ->and($links[1]->get())->toBe('https://www.otsch.codes/blog');
});

// Inverse of onSameDomain(): the link pointing to the domain of the request URL is the one that's dropped.
it('doesn\'t find links on the same domain when notOnSameDomain() was called', function () {
    $body = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        <a href="https://www.example.com/foo">link3</a>
        HTML;

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $links = helper_invokeStepWithInput((new GetLinks())->notOnSameDomain(), $respondedRequest);

    expect($links)->toHaveCount(2)
        ->and($links[0]->get())->toBe('https://www.crwlr.software/packages')
        ->and($links[1]->get())->toBe('https://www.example.com/foo');
});

// Only the two links below crwlr.software are expected; crwl.io is a different domain.
it('finds only links from domains the onDomain() method was called with', function () {
    $body = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        <a href="https://www.crwl.io">link3</a>
        <a href="https://www.crwlr.software/blog">link4</a>
        HTML;

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput((new GetLinks())->onDomain('crwlr.software'), $respondedRequest);

    expect($outputs)->toHaveCount(2)
        ->and($outputs[0]->get())->toBe('https://www.crwlr.software/packages')
        ->and($outputs[1]->get())->toBe('https://www.crwlr.software/blog');
});

test('onDomain() also takes an array of domains', function () {
    $allowedDomains = ['otsch.codes', 'crwlr.software'];

    $body = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        <a href="https://www.example.com/yolo">link3</a>
        HTML;

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput((new GetLinks())->onDomain($allowedDomains), $respondedRequest);

    expect($outputs)->toHaveCount(2)
        ->and($outputs[0]->get())->toBe('https://www.otsch.codes/contact')
        ->and($outputs[1]->get())->toBe('https://www.crwlr.software/packages');
});

// The same step instance gets more and more allowed domains; with every call more of the three links match.
test('onDomain() can be called multiple times and merges all domains it was called with', function () {
    $html = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        <a href="https://www.example.com/yolo">link3</a>
        HTML;

    // Creates a fresh responded request (same URL, same body) for every run of the step.
    $makeRespondedRequest = static function () use ($html): RespondedRequest {
        return new RespondedRequest(
            new Request('GET', 'https://www.otsch.codes'),
            new Response(200, [], $html),
        );
    };

    $step = (new GetLinks())->onDomain('crwl.io');

    expect(helper_invokeStepWithInput($step, $makeRespondedRequest()))->toHaveCount(0);

    $step->onDomain(['otsch.codes', 'crwlr.software']);

    expect(helper_invokeStepWithInput($step, $makeRespondedRequest()))->toHaveCount(2);

    $step->onDomain('example.com');

    expect(helper_invokeStepWithInput($step, $makeRespondedRequest()))->toHaveCount(3);
});

// Host matching is stricter than domain matching: jobs.otsch.codes is not the host of the request URL.
it('finds only links on the same host when onSameHost() was called', function () {
    $body = <<<HTML
        <a href="https://www.crwlr.software/packages">link1</a>
        <a href="https://www.otsch.codes/contact">link2</a>
        <a href="https://jobs.otsch.codes">link3</a>
        <a href="https://www.otsch.codes/blog">link4</a>
        HTML;

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $links = helper_invokeStepWithInput((new GetLinks())->onSameHost(), $respondedRequest);

    expect($links)->toHaveCount(2)
        ->and($links[0]->get())->toBe('https://www.otsch.codes/contact')
        ->and($links[1]->get())->toBe('https://www.otsch.codes/blog');
});

// Inverse of onSameHost(): only the link to the exact host of the request URL is dropped.
it('doesn\'t find links on the same host when notOnSameHost() was called', function () {
    $body = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://jobs.otsch.codes">link2</a>
        <a href="https://www.crwlr.software/packages">link3</a>
        HTML;

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $links = helper_invokeStepWithInput((new GetLinks())->notOnSameHost(), $respondedRequest);

    expect($links)->toHaveCount(2)
        ->and($links[0]->get())->toBe('https://jobs.otsch.codes')
        ->and($links[1]->get())->toBe('https://www.crwlr.software/packages');
});

// blog.crwlr.software shares the domain but not the host, so it is not expected in the output.
it('finds only links from hosts the onHost() method was called with', function () {
    $body = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        <a href="https://blog.crwlr.software">link3</a>
        <a href="https://www.crwlr.software/packages/crawler/v0.4/getting-started">link4</a>
        HTML;

    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.otsch.codes'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput((new GetLinks())->onHost('www.crwlr.software'), $respondedRequest);

    expect($outputs)->toHaveCount(2)
        ->and($outputs[0]->get())->toBe('https://www.crwlr.software/packages')
        ->and($outputs[1]->get())->toBe('https://www.crwlr.software/packages/crawler/v0.4/getting-started');
});

// One step instance, configured with two hosts, is run against two documents: the first contains a link to
// only one of the hosts, the second contains links to both.
test('onHost() also takes an array of hosts', function () {
    // Wraps the given body in a responded request for https://www.otsch.codes.
    $respondedRequestWithBody = static function (string $body): RespondedRequest {
        return new RespondedRequest(
            new Request('GET', 'https://www.otsch.codes'),
            new Response(200, [], $body),
        );
    };

    $firstDocument = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        HTML;

    $step = (new GetLinks())->onHost(['www.otsch.codes', 'blog.example.com']);

    $outputs = helper_invokeStepWithInput($step, $respondedRequestWithBody($firstDocument));

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe('https://www.otsch.codes/contact');

    $secondDocument = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        <a href="https://www.crwlr.software/packages">link2</a>
        <a href="https://blog.example.com/articles/1">link3</a>
        HTML;

    $outputs = helper_invokeStepWithInput($step, $respondedRequestWithBody($secondDocument));

    expect($outputs)->toHaveCount(2)
        ->and($outputs[1]->get())->toBe('https://blog.example.com/articles/1');
});

// Hosts from consecutive onHost() calls add up: the host from the first call still matches after more hosts
// were added with a second call.
test('onHost() can be called multiple times and merges all hosts it was called with', function () {
    // Wraps the given body in a responded request for https://www.otsch.codes.
    $respondedRequestWithBody = static function (string $body): RespondedRequest {
        return new RespondedRequest(
            new Request('GET', 'https://www.otsch.codes'),
            new Response(200, [], $body),
        );
    };

    $step = (new GetLinks())->onHost('www.crwl.io');

    $onlyOtschLink = <<<HTML
        <a href="https://www.otsch.codes/contact">link1</a>
        HTML;

    $outputs = helper_invokeStepWithInput($step, $respondedRequestWithBody($onlyOtschLink));

    expect($outputs)->toHaveCount(0);

    $step->onHost(['www.otsch.codes', 'www.crwlr.software']);

    $onlyCrwlIoLink = <<<HTML
        <a href="https://www.crwl.io">link1</a>
        HTML;

    $outputs = helper_invokeStepWithInput($step, $respondedRequestWithBody($onlyCrwlIoLink));

    expect($outputs)->toHaveCount(1)
        ->and($outputs[0]->get())->toBe('https://www.crwl.io');

    $bothLinks = <<<HTML
        <a href="https://www.otsch.codes/blog">link1</a>
        <a href="https://www.crwl.io">link2</a>
        HTML;

    $outputs = helper_invokeStepWithInput($step, $respondedRequestWithBody($bothLinks));

    expect($outputs)->toHaveCount(2)
        ->and($outputs[0]->get())->toBe('https://www.otsch.codes/blog')
        ->and($outputs[1]->get())->toBe('https://www.crwl.io');
});

// The document's <base href="/c/d"> is expected to replace the request path (/a/b) as reference for resolving
// relative links; the link with an absolute path (/f/g) is not affected by it.
it('works correctly when HTML contains a base tag', function () {
    $html = <<<HTML
        <!DOCTYPE html>
        <html>
        <head>
        <base href="/c/d" />
        </head>
        <body>
        <a href="e">link</a>
        <a href="/f/g">link2</a>
        <a href="./h">link3</a>
        </body>
        </html>
        HTML;

    $step = (new GetLinks());

    $links = helper_invokeStepWithInput($step, new RespondedRequest(
        new Request('GET', 'https://www.example.com/a/b'),
        new Response(200, [], $html),
    ));

    // Pin the number of outputs too, so e.g. the base element itself being treated as a link is detected.
    expect($links)->toHaveCount(3)
        ->and($links[0]->get())->toBe('https://www.example.com/c/e')
        ->and($links[1]->get())->toBe('https://www.example.com/f/g')
        ->and($links[2]->get())->toBe('https://www.example.com/c/h');
});

// The same step instance is run twice against the same response: first with default settings (fragments are
// kept), then again after withoutFragment() was called (fragments are dropped).
it('throws away the URL fragment part when withoutFragment() was called', function () {
    $getLinks = new GetLinks();

    $body = <<<HTML
        <!DOCTYPE html>
        <html>
        <head></head>
        <body>
            <a href="/foo/bar#fragment">link</a> <br>
            <a href="/baz#quz-fragment">another link</a> <br>
        </body>
        </html>
        HTML;

    $response = new RespondedRequest(
        new Request('GET', 'https://www.example.com/foo/baz'),
        new Response(200, [], $body),
    );

    $withFragments = helper_invokeStepWithInput($getLinks, $response);

    expect($withFragments[0]->get())->toBe('https://www.example.com/foo/bar#fragment')
        ->and($withFragments[1]->get())->toBe('https://www.example.com/baz#quz-fragment');

    $getLinks->withoutFragment();

    $withoutFragments = helper_invokeStepWithInput($getLinks, $response);

    expect($withoutFragments[0]->get())->toBe('https://www.example.com/foo/bar')
        ->and($withoutFragments[1]->get())->toBe('https://www.example.com/baz');
});

// mailto:, javascript: and tel: links are interleaved with three regular links; only the regular ones are
// expected in the output, in document order.
it('ignores special non HTTP links', function () {
    $body = <<<HTML
        <!DOCTYPE html>
        <html>
        <head></head>
        <body>
        <a href="mailto:somebody@example.com">mailto link</a>
        <a href="/one">link one</a>
        <a href="javascript:alert('hello');">javascript link</a>
        <a href="/two">link two</a>
        <a href="tel:+499123456789">phone link</a>
        <a href="/three">link three</a>
        </body>
        </html>
        HTML;

    $response = new RespondedRequest(
        new Request('GET', 'https://www.example.com/home'),
        new Response(200, [], $body),
    );

    $outputs = helper_invokeStepWithInput(new GetLinks(), $response);

    expect($outputs)->toHaveCount(3)
        ->and($outputs[0]->get())->toBe('https://www.example.com/one')
        ->and($outputs[1]->get())->toBe('https://www.example.com/two')
        ->and($outputs[2]->get())->toBe('https://www.example.com/three');
});


================================================
FILE: tests/Steps/Html/MetaDataTest.php
================================================
<?php

namespace tests\Steps\Html;

use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Html\MetaData;

use function tests\helper_invokeStepWithInput;

// The head of the document is empty: no title element and no meta tags at all.
it('returns an array with key title and empty string if the HTML document doesn\'t even contain a title', function () {
    $document = <<<HTML
        <!DOCTYPE html>
        <html>
        <head>
        </head>
        <body>Hello World!</body>
        </html>
        HTML;

    $step = new MetaData();

    $results = helper_invokeStepWithInput($step, $document);

    $metaData = $results[0]->get();

    expect($metaData)->toBe(['title' => '']);
});

// The charset meta tag has neither a name nor a property attribute, so it's not part of the expected output.
// The title is expected without the whitespace surrounding it in the markup.
it('returns an array with the title and all meta tags having a name or property attribute', function () {
    $expected = [
        'title' => 'Hello World!',
        'description' => 'This is a page saying: Hello World!',
        'keywords' => 'lorem, ipsum, hello, world',
        'og:title' => 'Hello World!',
        'og:type' => 'website',
    ];

    $document = <<<HTML
        <!DOCTYPE html>
        <html>
        <head>
        <meta charset="UTF-8">
        <title>
            Hello World!
        </title>
        <meta name="description" content="This is a page saying: Hello World!" />
        <meta name="keywords" content="lorem, ipsum, hello, world" />
        <meta property="og:title" content="Hello World!" />
        <meta property="og:type" content="website" />
        </head>
        <body>Hello World!</body>
        </html>
        HTML;

    $results = helper_invokeStepWithInput(new MetaData(), $document);

    expect($results[0]->get())->toBe($expected);
});

// With only() the output is reduced to the listed keys; note that the title isn't in the expected output either.
it('returns only the meta tags defined via the only() method', function () {
    $document = <<<HTML
        <!DOCTYPE html>
        <html>
        <head>
        <meta charset="UTF-8">
        <title>
            Hello World!
        </title>
        <meta name="description" content="This is a page saying: Hello World!" />
        <meta name="keywords" content="lorem, ipsum, hello, world" />
        <meta property="og:title" content="Hello World!" />
        <meta property="og:type" content="website" />
        </head>
        <body>Hello World!</body>
        </html>
        HTML;

    $step = Html::metaData()->only(['description', 'og:title']);

    $results = helper_invokeStepWithInput($step, $document);

    expect($results[0]->get())->toBe([
        'description' => 'This is a page saying: Hello World!',
        'og:title' => 'Hello World!',
    ]);
});


================================================
FILE: tests/Steps/Html/SchemaOrgTest.php
================================================
<?php

namespace tests\Steps\Html;

use Crwlr\Crawler\Steps\Html;
use Spatie\SchemaOrg\Article;
use Spatie\SchemaOrg\JobPosting;

use function tests\helper_invokeStepWithInput;

/**
 * Example HTML document with exactly one JSON-LD script block (a schema.org JobPosting, minified on a single
 * line) located in the body.
 */
function helper_schemaOrgExampleOneJobPostingInBody(): string
{
    $document = <<<HTML
        <!DOCTYPE html>
        <html lang="de">
        <head><title>Foo Bar</title></head>
        <body>
        <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"JobPosting","title":"Senior Full Stack PHP Developer (w\/m\/d)","employmentType":["FULL_TIME"],"datePosted":"2022-07-25","description":"foo bar baz","hiringOrganization":{"@type":"Organization","name":"Foo Ltd.","logo":"https:\/\/www.example.com\/logo.png"},"jobLocation":{"@type":"Place","address":{"@type":"PostalAddress","addressLocality":"Linz","addressRegion":"Upper Austria","addressCountry":"Austria"}},"identifier":{"@type":"PropertyValue","name":"foo","value":123456},"directApply":true} </script>
        <h1>Baz</h1> <p>Other content</p>
        </body>
        </html>
        HTML;

    return $document;
}

/**
 * Example HTML document containing three JSON-LD script blocks: an FAQPage and an Organization in the head,
 * and an Article (with a nested author Person and publisher Organization) in the body.
 */
function helper_schemaOrgExampleMultipleObjects(): string
{
    $document = <<<HTML
        <!DOCTYPE html>
        <html lang="de-AT">
        <head>
        <title>Foo Bar</title>
        <script type="application/ld+json">
        {
            "mainEntity": [{
                "name": "Some Question?",
                "acceptedAnswer": {
                    "text": "bli bla blub!",
                    "@type": "Answer"
                },
                "@type": "Question"
            }, {
                "name": "Another question?",
                "acceptedAnswer": {
                    "text": "bla blu blo!",
                    "@type": "Answer"
                },
                "@type": "Question"
            }],
            "@type": "FAQPage",
            "@context": "http://schema.org"
        }
        </script>
        <meta property="og:title" content="Some Article" />
        <meta property="og:type" content="website" />
        <script type="application/ld+json">
        { "@context": "http://schema.org",
        "@type": "Organization",
        "name": "Example Company",
        "url": "https://www.example.com",
        "logo": "https://www.example.com/logo.png", "sameAs": [ "https://some.social-media.app/example-company" ] }
        </script>
        </head>
        <body>
        <h1>Some Article</h1>
        <h2>This is some article about something.</h2>
        <script type="application/ld+json">
        {
            "@context": "https:\/\/schema.org",
            "@type": "Article",
            "name": "Some Article",
            "url": "https:\/\/de.example.org\/articles\/some",
            "sameAs": "http:\/\/www.example.org\/articles\/A123456789",
            "mainEntity": "http:\/\/www.example.org\/articles\/A123456789",
            "author": {
                "@type": "Person",
                "name": "Jane Doe",
                "url": "https://example.com/profile/janedoe123"
            },
            "publisher": {
                "@type": "Organization",
                "name": "Some Organization, Inc.",
                "logo": {
                    "@type": "ImageObject",
                    "url": "https:\/\/www.example.org\/images\/organization-logo.png"
                }
            },
            "datePublished": "2023-09-07T21:57:44Z",
            "image": "https:\/\/images.example.org\/2023\/A123456789.jpg",
            "headline": "This is some article about something."
        }
        </script>
        </body>
        </html>
        HTML;

    return $document;
}

// The example document contains a single JSON-LD block of type JobPosting.
it('extracts schema.org data in JSON-LD format from an HTML document', function () {
    $results = helper_invokeStepWithInput(Html::schemaOrg(), helper_schemaOrgExampleOneJobPostingInBody());

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBeInstanceOf(JobPosting::class);
});

// With toArray() not only the top level object, but also nested objects (here: hiringOrganization) are
// expected to be plain arrays.
it('converts the spatie schema.org objects to arrays when calling the toArray() method', function () {
    $step = Html::schemaOrg()->toArray();

    $results = helper_invokeStepWithInput($step, helper_schemaOrgExampleOneJobPostingInBody());

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBeArray()
        ->and($results[0]->get()['hiringOrganization'])->toBeArray()
        ->and($results[0]->get()['hiringOrganization'])->toHaveKey('name')
        ->and($results[0]->get()['hiringOrganization']['name'])->toBe('Foo Ltd.');
});

// The example document contains three separate JSON-LD script blocks.
it('gets all the schema.org objects contained in a document', function () {
    $results = helper_invokeStepWithInput(Html::schemaOrg(), helper_schemaOrgExampleMultipleObjects());

    expect($results)->toHaveCount(3);
});

// Only one of the JSON-LD blocks in the example document is of type Article.
it('gets only schema.org objects of a certain type if you use the onlyType method', function () {
    $step = Html::schemaOrg()->onlyType('Article');

    $results = helper_invokeStepWithInput($step, helper_schemaOrgExampleMultipleObjects());

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBeInstanceOf(Article::class);
});

// The example document contains one top level Organization and one nested as publisher of the Article.
it('also finds schema.org objects of a certain type in children of another schema.org object', function () {
    $step = Html::schemaOrg()->onlyType('Organization');

    $results = helper_invokeStepWithInput($step, helper_schemaOrgExampleMultipleObjects());

    expect($results)->toHaveCount(2)
        ->and($results[0]->get()->getProperty('name'))->toBe('Example Company')
        ->and($results[1]->get()->getProperty('name'))->toBe('Some Organization, Inc.');
});

// Mapping entries without a string key use the property name as output key; 'publisher.name' addresses a
// property of the nested publisher object.
it('extracts certain data from schema.org objects when using the extract() method', function () {
    $mapping = ['url', 'headline', 'publisher' => 'publisher.name'];

    $step = Html::schemaOrg()->onlyType('Article')->extract($mapping);

    $results = helper_invokeStepWithInput($step, helper_schemaOrgExampleMultipleObjects());

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBe([
            'url' => 'https://de.example.org/articles/some',
            'headline' => 'This is some article about something.',
            'publisher' => 'Some Organization, Inc.',
        ]);
});

// The Article in the example document has no alternativeHeadline property.
test('If an object doesn\'t contain a property from the extract mapping, it\'s just null in the output', function () {
    $mapping = ['url', 'headline', 'alternativeHeadline'];

    $step = Html::schemaOrg()->onlyType('Article')->extract($mapping);

    $results = helper_invokeStepWithInput($step, helper_schemaOrgExampleMultipleObjects());

    expect($results)->toHaveCount(1)
        ->and($results[0]->get())->toBe([
            'url' => 'https://de.example.org/articles/some',
            'headline' => 'This is some article about something.',
            'alternativeHeadline' => null,
        ]);
});


================================================
FILE: tests/Steps/Html/XPathQueryTest.php
================================================
<?php

namespace tests\Steps\Html;

use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Dom\XmlDocument;
use Crwlr\Crawler\Steps\Html\Exceptions\InvalidDomQueryException;
use Crwlr\Crawler\Steps\Html\XPathQuery;

use function tests\helper_getSimpleListHtml;

// The query is already validated when the object is constructed.
it('throws an exception when created with an invalid XPath query', function () {
    $invalidQuery = '//a/@@bob/uncle';

    new XPathQuery($invalidQuery);
})->throws(InvalidDomQueryException::class);

test('The apply method returns a string for a single match', function () {
    $query = new XPathQuery('//item');

    $document = new XmlDocument('<item>test</item>');

    expect($query->apply($document))->toBe('test');
});

// The text of the second item includes the text of its child element.
test('The apply method returns an array of strings for multiple matches', function () {
    $query = new XPathQuery('//item');

    $document = new HtmlDocument('<item>test</item><item>test 2 <test>sub</test></item><item>test 3</item>');

    expect($query->apply($document))->toBe(['test', 'test 2 sub', 'test 3']);
});

test('The apply method returns null if nothing matches', function () {
    $query = new XPathQuery('//aitem');

    $document = new XmlDocument('<item>test</item>');

    expect($query->apply($document))->toBeNull();
});

// The text node of the item is surrounded by line breaks and indentation.
it('trims whitespace', function () {
    $query = new XPathQuery('//item');

    $source = <<<XML
        <item>
            test
        </item>
        XML;

    $result = $query->apply(new XmlDocument($source));

    expect($result)->toBe('test');
});

it('contains inner tags when the html method is called', function () {
    $query = (new XPathQuery('//item'))->html();

    $document = new XmlDocument('<item>test <sub>sub</sub></item>');

    expect($query->apply($document))->toBe('test <sub>sub</sub>');
});

it('contains also the outer tag when the outerHtml method is called', function () {
    $source = '<item>test <sub>sub</sub></item>';

    $query = (new XPathQuery('//item'))->outerHtml();

    $result = $query->apply(new XmlDocument($source));

    expect($result)->toBe('<item>test <sub>sub</sub></item>');
});

it('gets the contents of an attribute using the attribute method', function () {
    $query = (new XPathQuery('//item'))->attribute('attr');

    $document = new XmlDocument('<item attr="content">test</item>');

    expect($query->apply($document))->toBe('content');
});

// The matched element exists, but doesn't have the requested attribute: the result is '' and not null.
test('getting an attribute value returns an empty string when the attribute does not exist', function () {
    $query = (new XPathQuery('//item'))->attribute('attr');

    $document = new XmlDocument('<item>test</item>');

    expect($query->apply($document))->toBe('');
});

// Setting a base URL alone doesn't change the result; the value is only resolved against it once
// toAbsoluteUrl() was called on the query.
it('turns the value into an absolute url when toAbsoluteUrl() is called', function () {
    $xmlDocument = new XmlDocument('<item>/foo/bar</item>');

    $xPathQuery = (new XPathQuery('//item'))->setBaseUrl('https://www.example.com');

    $relative = $xPathQuery->apply($xmlDocument);

    expect($relative)->toBe('/foo/bar');

    $xPathQuery->toAbsoluteUrl();

    $absolute = $xPathQuery->apply($xmlDocument);

    expect($absolute)->toBe('https://www.example.com/foo/bar');
});

// Without link() the query yields the text of the matched anchor, after calling link() it yields the
// anchor's href resolved against the base URL.
it('gets an absolute link from the href attribute of a link element, when the link() method is called', function () {
    $htmlDocument = new HtmlDocument('<div id="foo"><a class="bar" href="/foo/bar">Foo</a></div>');

    $xPathQuery = (new XPathQuery('//*[@id=\'foo\']/a[@class=\'bar\']'))
        ->setBaseUrl('https://www.example.com/');

    $linkText = $xPathQuery->apply($htmlDocument);

    expect($linkText)->toBe('Foo');

    $xPathQuery->link();

    $linkUrl = $xPathQuery->apply($htmlDocument);

    expect($linkUrl)->toBe('https://www.example.com/foo/bar');
});

it('gets only the first matching element when the first() method is called', function () {
    $query = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->first();

    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($query->apply($document))->toBe('one');
});

it('gets only the last matching element when the last() method is called', function () {
    $query = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->last();

    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($query->apply($document))->toBe('four');
});

// nth() counts from 1: the third match is expected to be 'three'.
it('gets only the nth matching element when the nth() method is called', function () {
    $query = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->nth(3);

    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($query->apply($document))->toBe('three');
});

// Asks for the fifth match, which is expected not to exist in the simple list fixture.
it('returns null when no nth matching element exists', function () {
    $query = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->nth(5);

    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($query->apply($document))->toBeNull();
});

// Even refers to the 1-based position of the matches: the second and the fourth one.
it('gets only even matching elements when the even() method is called', function () {
    $query = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->even();

    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($query->apply($document))->toBe(['two', 'four']);
});

// Odd refers to the 1-based position of the matches: the first and the third one.
it('gets only odd matching elements when the odd() method is called', function () {
    $query = (new XPathQuery("//*[@id = 'list']/*[contains(@class, 'item')]"))->odd();

    $document = new HtmlDocument(helper_getSimpleListHtml());

    expect($query->apply($document))->toBe(['one', 'three']);
});


================================================
FILE: tests/Steps/HtmlTest.php
================================================
<?php

namespace tests\Steps;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Html\GetLink;
use Crwlr\Crawler\Steps\Html\GetLinks;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

use function tests\helper_invokeStepWithInput;

/**
 * Reads an HTML fixture file from the _Files/Html directory next to this test file.
 *
 * A missing or unreadable fixture is reported with an exception instead of being turned into an empty
 * string, so a broken fixture path surfaces as a clear error rather than as a misleading assertion failure.
 *
 * @param string $fileName Name of the fixture file, relative to the _Files/Html directory.
 * @return string The raw content of the fixture file.
 * @throws \RuntimeException When the fixture file does not exist or cannot be read.
 */
function helper_getHtmlContent(string $fileName): string
{
    $path = __DIR__ . '/_Files/Html/' . $fileName;

    // Checking for the file first avoids the warning file_get_contents() emits for a missing file.
    $content = is_file($path) ? file_get_contents($path) : false;

    if ($content === false) {
        throw new \RuntimeException('Unable to read HTML fixture file: ' . $path);
    }

    return $content;
}

it('returns single strings when extract is called with a selector only', function () {
    // extract() gets a plain selector string here, not a mapping array.
    $step = Html::each('#bookstore .book')->extract('.title');

    $outputs = helper_invokeStepWithInput($step, helper_getHtmlContent('bookstore.html'));

    expect($outputs)->toHaveCount(4);

    expect($outputs[0]->get())->toBe('Everyday Italian');

    expect($outputs[3]->get())->toBe('Learning XML');
});

it('extracts data from an HTML document with CSS selectors by default', function () {
    // Plain strings in the mapping are treated as CSS selectors.
    $mapping = ['title' => '.title', 'author' => '.author', 'year' => '.year'];

    $books = helper_invokeStepWithInput(
        Html::each('#bookstore .book')->extract($mapping),
        helper_getHtmlContent('bookstore.html'),
    );

    expect($books)->toHaveCount(4);

    expect($books[0]->get())->toBe([
        'title' => 'Everyday Italian',
        'author' => 'Giada De Laurentiis',
        'year' => '2005',
    ]);

    expect($books[1]->get())->toBe(['title' => 'Harry Potter', 'author' => 'J K. Rowling', 'year' => '2005']);

    // The author value of the third book is expected to be an array of names.
    expect($books[2]->get())->toBe([
        'title' => 'XQuery Kick Start',
        'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'],
        'year' => '2003',
    ]);

    expect($books[3]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']);
});

it('can also extract data using XPath queries', function () {
    // Both the each() base and every mapping entry are XPath queries instead of CSS selectors.
    $step = Html::each(Dom::xPath("//div[@id='bookstore']/div[@class='book']"))->extract([
        'title' => Dom::xPath("//h3[@class='title']"),
        'author' => Dom::xPath("//*[@class='author']"),
        'year' => Dom::xPath("//span[@class='year']"),
    ]);

    $books = helper_invokeStepWithInput($step, helper_getHtmlContent('bookstore.html'));

    expect($books)->toHaveCount(4);

    expect($books[2]->get())->toBe([
        'title' => 'XQuery Kick Start',
        'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'],
        'year' => '2003',
    ]);
});

it('returns only one (compound) output when the root method is used', function () {
    $step = Html::root()->extract(['title' => '.title', 'author' => '.author', 'year' => '.year']);

    $outputs = helper_invokeStepWithInput($step, helper_getHtmlContent('bookstore.html'));

    expect($outputs)->toHaveCount(1);

    // With root() as base, the titles of all books are expected in the one and only output.
    $compound = $outputs[0]->get();

    expect($compound['title'])->toBe([
        'Everyday Italian',
        'Harry Potter',
        'XQuery Kick Start',
        'Learning XML',
    ]);
});

it('extracts the data of the first matching element when the first method is used', function () {
    $mapping = ['title' => '.title', 'author' => '.author', 'year' => '.year'];

    $outputs = helper_invokeStepWithInput(
        Html::first('#bookstore .book')->extract($mapping),
        helper_getHtmlContent('bookstore.html'),
    );

    // Only the first of the matching book elements is expected in the result.
    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe([
        'title' => 'Everyday Italian',
        'author' => 'Giada De Laurentiis',
        'year' => '2005',
    ]);
});

it('extracts the data of the last matching element when the last method is used', function () {
    $mapping = ['title' => '.title', 'author' => '.author', 'year' => '.year'];

    $outputs = helper_invokeStepWithInput(
        Html::last('#bookstore .book')->extract($mapping),
        helper_getHtmlContent('bookstore.html'),
    );

    // Only the last of the matching book elements is expected in the result.
    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe([
        'title' => 'Learning XML',
        'author' => 'Erik T. Ray',
        'year' => '2003',
    ]);
});

test(
    'you can extract data in a second level to the output array using another Html step as an element in the mapping ' .
    'array',
    function () {
        // The input is a full response, so the relative slide links can be resolved to absolute URLs.
        $respondedRequest = new RespondedRequest(
            new Request('GET', 'https://www.example.com/meetups/some-meetup/'),
            new Response(body: helper_getHtmlContent('event.html')),
        );

        // Nested step, producing one array per talk element under the "talks" key.
        $talksStep = Html::each('#event .talks .talk')->extract([
            'title' => '.title',
            'speaker' => '.speaker',
            'slides' => Dom::cssSelector('.slidesLink')->attribute('href')->toAbsoluteUrl(),
        ]);

        $eventStep = Html::root()->extract([
            'title' => '#event h1',
            'location' => '#event .location',
            'date' => '#event .date',
            'talks' => $talksStep,
        ]);

        $outputs = helper_invokeStepWithInput($eventStep, $respondedRequest);

        expect($outputs)->toHaveCount(1);

        expect($outputs[0]->get())->toBe([
            'title' => 'Some Meetup',
            'location' => 'Somewhere',
            'date' => '2023-01-14 21:00',
            'talks' => [
                [
                    'title' => 'Sophisticated talk title',
                    'speaker' => 'Super Mario',
                    'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk1.pdf',
                ],
                [
                    'title' => 'Simple beginner talk',
                    'speaker' => 'Luigi',
                    'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk2.pdf',
                ],
                [
                    'title' => 'Fun talk',
                    'speaker' => 'Princess Peach',
                    'slides' => 'https://www.example.com/meetups/some-meetup/slides/talk3.pdf',
                ],
            ],
        ]);
    },
);

test(
    'When a child step is nested in the extraction and does not use each(), the extracted value is an array with ' .
    'the keys defined in extract(), rather than an array of such arrays as it would be with each().',
    function () {
        $html = <<<HTML
            <!DOCTYPE html>
            <html lang="en">
                <head><title>something</title></head>
                <body>
                <div class="company">
                    <div class="name">ABCDEFGmbH</div>
                    <div class="founded">1984</div>
                    <div class="location">
                        <span class="country">Germany</span>, <span class="city">Frankfurt</span>
                    </div>
                </div>
                <div class="company">
                    <div class="name">Saubär GmbH</div>
                    <div class="founded">2014</div>
                    <div class="location">
                        <span class="country">Austria</span>, <span class="city">Klagenfurt</span>
                    </div>
                </div>
                </body>
            </html>
            HTML;

        $expectedCompanies = [
            [
                'name' => 'ABCDEFGmbH',
                'founded' => '1984',
                'location' => ['country' => 'Germany', 'city' => 'Frankfurt'],
            ],
            [
                'name' => 'Saubär GmbH',
                'founded' => '2014',
                'location' => ['country' => 'Austria', 'city' => 'Klagenfurt'],
            ],
        ];

        // The nested location step is tried with each of the non-each() bases: root(), first() and last().
        // All of them have to produce the very same flat location array.
        $locationSteps = [
            Html::root()->extract(['country' => '.location .country', 'city' => '.location .city']),
            Html::first('.location')->extract(['country' => '.country', 'city' => '.city']),
            Html::last('.location')->extract(['country' => '.country', 'city' => '.city']),
        ];

        foreach ($locationSteps as $locationStep) {
            $companyStep = Html::each('.company')->extract([
                'name' => '.name',
                'founded' => '.founded',
                'location' => $locationStep,
            ]);

            $companies = helper_invokeStepWithInput($companyStep, $html);

            expect($companies)->toHaveCount(2)
                ->and($companies[0]->get())->toBe($expectedCompanies[0])
                ->and($companies[1]->get())->toBe($expectedCompanies[1]);
        }
    },
);

test(
    'when selecting elements with each(), you can reference the element already selected within the each() selector ' .
    'itself, in sub selectors',
    function () {
        $body = <<<HTML
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <title>Bookstore Example in HTML :)</title>
            </head>
            <body>
                <div id="list">
                    <div class="element" data-attr="yo">
                        <a href="/bar">direct element child</a>
                        <div class="sub-element">
                            <a href="/baz">sub child</a>
                        </div>
                    </div>
                </div>
            </body>
            </html>
            HTML;

        $respondedRequest = new RespondedRequest(
            new Request('GET', 'https://www.example.com/foo'),
            new Response(body: $body),
        );

        // The point of this test: the child selectors below refer to the .element node that each() has
        // already selected. '.element > a' targets its direct child link, and the empty selector is
        // expected to deliver the data-attr value of .element itself.
        $step = Html::each('#list .element')->extract([
            'link' => Dom::cssSelector('.element > a')->link(),
            'attribute' => Dom::cssSelector('')->attribute('data-attr'),
        ]);

        $outputs = helper_invokeStepWithInput($step, $respondedRequest);

        expect($outputs)->toHaveCount(1);

        expect($outputs[0]->get())->toBe([
            'link' => 'https://www.example.com/bar',
            'attribute' => 'yo',
        ]);
    },
);

test('the static getLink method works without argument', function () {
    // Calling the factory without a selector has to give back a GetLink step.
    $step = Html::getLink();

    expect($step)->toBeInstanceOf(GetLink::class);
});

test('the static getLinks method works without argument', function () {
    // Calling the factory without a selector has to give back a GetLinks step.
    $step = Html::getLinks();

    expect($step)->toBeInstanceOf(GetLinks::class);
});


================================================
FILE: tests/Steps/JsonTest.php
================================================
<?php

namespace tests\Steps;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Json;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use GuzzleHttp\Psr7\Utils;
use PHPUnit\Framework\TestCase;

use function tests\helper_invokeStepWithInput;

/** @var TestCase $this */

it('accepts RespondedRequest as input', function () {
    // The JSON is the body of the response wrapped in the RespondedRequest.
    $body = Utils::streamFor('{ "data": { "foo": "bar" } }');

    $input = new RespondedRequest(new Request('GET', '/'), new Response(body: $body));

    $outputs = helper_invokeStepWithInput(Json::get(['foo' => 'data.foo']), $input);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe(['foo' => 'bar']);
});

it('accepts PSR-7 Response as input', function () {
    // A bare PSR-7 response, without the request it belongs to, is passed as input.
    $input = new Response(body: Utils::streamFor('{ "data": { "foo": "bar" } }'));

    $outputs = helper_invokeStepWithInput(Json::get(['foo' => 'data.foo']), $input);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe(['foo' => 'bar']);
});

it('extracts data defined using dot notation', function () {
    $input = <<<JSON
        {
            "data": {
                "target": {
                    "foo": "bar",
                    "bar": "foo",
                    "baz": "yo"
                }
            }
        }
        JSON;

    // Each mapping value is a dot separated path into the nested JSON object.
    $step = Json::get(['foo' => 'data.target.foo', 'baz' => 'data.target.baz']);

    $outputs = helper_invokeStepWithInput($step, $input);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe(['foo' => 'bar', 'baz' => 'yo']);
});

it('uses the array values in the mapping as output key when no string keys defined in the mapping array', function () {
    $input = <<<JSON
        {
            "data": {
                "target": {
                    "foo": "bar",
                    "bar": "foo",
                    "baz": "yo"
                }
            }
        }
        JSON;

    // The first mapping element has no string key, so its path is expected to be the output key.
    $step = Json::get(['data.target.foo', 'baz' => 'data.target.baz']);

    $outputs = helper_invokeStepWithInput($step, $input);

    expect($outputs[0]->get())->toBe(['data.target.foo' => 'bar', 'baz' => 'yo']);
});

it('can get items from a json array using a numeric key', function () {
    $input = <<<JSON
        {
            "data": {
                "target": {
                    "array": [
                        { "name": "Adam" },
                        { "name": "Eve" }
                    ]
                }
            }
        }
        JSON;

    // The path segment "1" addresses the second element of the JSON array.
    $step = Json::get(['name' => 'data.target.array.1.name']);

    $outputs = helper_invokeStepWithInput($step, $input);

    expect($outputs[0]->get())->toBe(['name' => 'Eve']);
});

test('Using the each method you can iterate over a json array and yield multiple results', function () {
    $input = <<<JSON
        {
            "list": {
                "people": [
                    { "name": "Peter", "age": { "years": 19 } },
                    { "name": "Paul", "age": { "years": 22 } },
                    { "name": "Mary", "age": { "years": 20 } }
                ]
            }
        }
        JSON;

    // The mapping paths are relative to each element of the array found at "list.people".
    $step = Json::each('list.people', ['name' => 'name', 'age' => 'age.years']);

    $people = helper_invokeStepWithInput($step, $input);

    expect($people)->toHaveCount(3);

    expect($people[0]->get())->toBe(['name' => 'Peter', 'age' => 19]);

    expect($people[1]->get())->toBe(['name' => 'Paul', 'age' => 22]);

    expect($people[2]->get())->toBe(['name' => 'Mary', 'age' => 20]);
});

test('When the root element is an array you can use each with empty string as param', function () {
    $input = <<<JSON
        [
            { "firstname": "Axel", "surname": "Klingmeier", "nickname": "Axel" },
            { "firstname": "Lieselotte", "surname": "Schroll", "nickname": "Lilo" },
            { "firstname": "Paula", "surname": "Monowitsch", "nickname": "Poppi" },
            { "firstname": "Dominik", "surname": "Kascha", "nickname": "Dominik" }
        ]
        JSON;

    // The empty string as each() target stands for the root array itself.
    $people = helper_invokeStepWithInput(Json::each('', ['nickname']), $input);

    expect($people)->toHaveCount(4);

    expect($people[0]->get())->toBe(['nickname' => 'Axel']);

    expect($people[1]->get())->toBe(['nickname' => 'Lilo']);

    expect($people[2]->get())->toBe(['nickname' => 'Poppi']);

    expect($people[3]->get())->toBe(['nickname' => 'Dominik']);
});

it('yields no results and logs a warning when the target for "each" does not exist', function () {
    // The input only has a top-level "foo" key, so the "boo.bar" target points to nothing.
    $step = Json::each('boo.bar', ['number']);

    // Attach a CliLogger so the warning ends up in the output that is inspected below.
    $step->addLogger(new CliLogger());

    $outputs = helper_invokeStepWithInput(
        $step,
        '{ "foo": { "bar": [{ "number": "one" }, { "number": "two" }] } }',
    );

    expect($outputs)->toHaveCount(0);

    expect($this->getActualOutputForAssertion())
        ->toContain('The target of "each" does not exist in the JSON data.');
});

it('also works with JS style JSON objects without quotes around keys', function () {
    // Two of the three keys are not quoted, which is valid in JS object literals but not in strict JSON.
    $input = <<<JSON
        {
            foo: "one",
            bar: "two",
            "baz": "three"
        }
        JSON;

    $result = helper_invokeStepWithInput(Json::get(['foo', 'bar', 'baz']), $input);

    expect($result)->toHaveCount(1);

    expect($result[0]->get())->toBe(['foo' => 'one', 'bar' => 'two', 'baz' => 'three']);
});

it('also correctly fixes keys without quotes, even when values contain colons', function () {
    // The colons inside the string values must not be mistaken for key/value separators.
    $input = <<<JSON
        {
            foo: "https://www.example.com",
            bar: 2,
            "baz": "some: thing"
        }
        JSON;

    $result = helper_invokeStepWithInput(Json::get(['foo', 'bar', 'baz']), $input);

    expect($result)->toHaveCount(1);

    expect($result[0]->get())->toBe([
        'foo' => 'https://www.example.com',
        'bar' => 2,
        'baz' => 'some: thing',
    ]);
});

it('also correctly fixes keys without quotes, when the value is an empty string', function () {
    // The unquoted key "foo" is followed by an empty string value.
    $input = <<<JSON
        {
            foo: "",
            "bar": "baz"
        }
        JSON;

    $result = helper_invokeStepWithInput(Json::get(['foo', 'bar']), $input);

    expect($result)->toHaveCount(1);

    expect($result[0]->get())->toBe(['foo' => '', 'bar' => 'baz']);
});

it('works with a string that is an HTML document and inside the body there\'s a JSON object', function () {
    // The input is not JSON itself, but an HTML document with the JSON object as body content.
    $html = <<<HTML
        <!doctype html>
        <html lang="en">
        <head>
        <title>JSON</title>
        </head>
        <body>
        { "foo": "Hello World!", "bar": "baz" }
        </body>
        HTML;

    $result = helper_invokeStepWithInput(Json::get(['title' => 'foo']), $html);

    expect($result)->toHaveCount(1);

    expect($result[0]->get())->toBe(['title' => 'Hello World!']);
});

it('gets the whole JSON object as array, when using the all() method', function () {
    $input = <<<JSON
        {
            "foo": "one",
            "bar": "two",
            "array": ["one", "two", "three"]
        }
        JSON;

    // all() takes no mapping; the complete decoded object is expected as the output.
    $result = helper_invokeStepWithInput(Json::all(), $input);

    expect($result)->toHaveCount(1);

    expect($result[0]->get())->toBe([
        'foo' => 'one',
        'bar' => 'two',
        'array' => ['one', 'two', 'three'],
    ]);
});

it('can also map the whole decoded data array to a output property', function () {
    $input = <<<JSON
        {
            "foo": "one",
            "bar": "two",
            "array": ["one", "two", "three"]
        }
        JSON;

    // The "*" path is expected to resolve to the complete decoded data.
    $result = helper_invokeStepWithInput(Json::get(['all' => '*']), $input);

    expect($result)->toHaveCount(1);

    $decoded = ['foo' => 'one', 'bar' => 'two', 'array' => ['one', 'two', 'three']];

    expect($result[0]->get())->toBe(['all' => $decoded]);
});

test('when there is a key * in the object, the * gets that key, not the whole decoded data', function () {
    // The object has a literal "*" key, which has to take precedence over the "whole data" meaning.
    $input = <<<JSON
        {
            "*": "yes",
            "foo": "bar",
            "baz": "quz"
        }
        JSON;

    $result = helper_invokeStepWithInput(Json::get(['shouldBeYes' => '*']), $input);

    expect($result)->toHaveCount(1);

    expect($result[0]->get())->toBe(['shouldBeYes' => 'yes']);
});

it('can also get the whole decoded data in the each() context', function () {
    $input = <<<JSON
        [
            { "name": "foo", "value": "one" },
            { "name": "bar", "value": "two" },
            { "name": "baz", "value": "three" }
        ]
        JSON;

    // Within each(), "*" is expected to resolve to the complete current array element.
    $result = helper_invokeStepWithInput(Json::each('', ['full' => '*']), $input);

    expect($result)->toHaveCount(3);

    expect($result[0]->get())->toBe(['full' => ['name' => 'foo', 'value' => 'one']]);

    expect($result[1]->get())->toBe(['full' => ['name' => 'bar', 'value' => 'two']]);

    expect($result[2]->get())->toBe(['full' => ['name' => 'baz', 'value' => 'three']]);
});

test('in the each() context, when there is a key *, it gets that, not the whole decoded data', function () {
    // Only the first element has a literal "*" key.
    $input = <<<JSON
        [
            { "name": "foo", "value": "one", "*": "yo" },
            { "name": "bar", "value": "two" }
        ]
        JSON;

    $result = helper_invokeStepWithInput(Json::each('', ['full' => '*']), $input);

    expect($result)->toHaveCount(2);

    // First element: the value of its "*" key. Second element: the whole element.
    expect($result[0]->get())->toBe(['full' => 'yo']);

    expect($result[1]->get())->toBe(['full' => ['name' => 'bar', 'value' => 'two']]);
});


================================================
FILE: tests/Steps/Loading/GetSitemapsFromRobotsTxtTest.php
================================================
<?php

namespace tests\Steps\Loading;

use Crwlr\Crawler\Input;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Steps\Sitemap;
use Crwlr\Crawler\UserAgents\UserAgent;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Response;
use GuzzleHttp\Psr7\Utils;
use Mockery;
use Psr\Http\Message\RequestInterface;

use function tests\helper_invokeStepWithInput;

it('gets all the sitemaps listed in the robots.txt file on a host, based on some URL on that host', function () {
    $robotsTxtUrl = 'https://www.crwlr.software/robots.txt';

    $robotsTxtContent = <<<ROBOTSTXT
        User-agent: *
        Disallow:

        Sitemap: https://www.crwlr.software/sitemap.xml
        Sitemap: https://www.crwlr.software/sitemap2.xml

        Sitemap: https://www.crwlr.software/sitemap3.xml
        ROBOTSTXT;

    $client = Mockery::mock(Client::class);

    // Exactly one request is expected, and it has to target the robots.txt file of the host.
    $client
        ->shouldReceive('sendRequest')
        ->once()
        ->withArgs(fn (RequestInterface $request) => (string) $request->getUri() === $robotsTxtUrl)
        ->andReturn(new Response(200, body: Utils::streamFor($robotsTxtContent)));

    $step = Sitemap::getSitemapsFromRobotsTxt();

    $step->setLoader(new HttpLoader(new UserAgent('SomeUserAgent'), $client));

    // The input is just some URL on the host, not the robots.txt URL itself.
    $sitemaps = helper_invokeStepWithInput($step, new Input('https://www.crwlr.software/packages'));

    expect($sitemaps)->toHaveCount(3);
});


================================================
FILE: tests/Steps/Loading/Http/DocumentTest.php
================================================
<?php

namespace tests\Steps\Loading\Http;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Loading\Http\Document;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

it('creates a HtmlDocument instance from a RespondedRequest', function () {
    $request = new Request('GET', 'https://www.example.com/foo');

    $response = new Response(
        200,
        body: '<!DOCTYPE html><html><head><title>foo</title></head><body>hello</body></html>',
    );

    $document = new Document(new RespondedRequest($request, $response));

    expect($document->dom())->toBeInstanceOf(HtmlDocument::class);

    // The outer HTML is expected without the doctype declaration.
    expect($document->dom()->outerHtml())->toBe('<html><head><title>foo</title></head><body>hello</body></html>');
});

it('returns the effectiveUri as url()', function () {
    // The body contains a base tag, which must not have any influence on url().
    $html = '<!doctype html><html><head><title>foo</title><base href="/baz" /></head><body>hello</body></html>';

    $request = new Request('GET', 'https://www.example.com/foo');

    $response = new Response(301, ['Location' => 'https://www.example.com/bar'], $html);

    $respondedRequest = new RespondedRequest($request, $response);

    $respondedRequest->addRedirectUri('https://www.example.com/bar');

    $url = (new Document($respondedRequest))->url();

    expect((string) $url)->toBe('https://www.example.com/bar');
});

it('returns the effectiveUri as baseUrl() if no base tag in HTML', function () {
    // The response has no body at all, hence there is no base tag either.
    $request = new Request('GET', 'https://www.example.com/foo');

    $response = new Response(301, ['Location' => 'https://www.example.com/bar']);

    $respondedRequest = new RespondedRequest($request, $response);

    $respondedRequest->addRedirectUri('https://www.example.com/bar');

    $baseUrl = (new Document($respondedRequest))->baseUrl();

    expect((string) $baseUrl)->toBe('https://www.example.com/bar');
});

it('returns the URL referenced in base tag as baseUrl()', function () {
    // The href of the base tag is a relative reference (/baz); the expectation is the absolute URL.
    $html = '<!doctype html><html><head><title>foo</title><base href="/baz" /></head><body>hello</body></html>';

    $request = new Request('GET', 'https://www.example.com/foo');

    $response = new Response(301, ['Location' => 'https://www.example.com/bar'], $html);

    $respondedRequest = new RespondedRequest($request, $response);

    $respondedRequest->addRedirectUri('https://www.example.com/bar');

    $baseUrl = (new Document($respondedRequest))->baseUrl();

    expect((string) $baseUrl)->toBe('https://www.example.com/baz');
});

it('returns the effectiveUri as canonicalUrl() if no canonical link in HTML', function () {
    // The response has no body at all, hence there is no canonical link either.
    $request = new Request('GET', 'https://www.example.com/foo');

    $response = new Response(301, ['Location' => 'https://www.example.com/bar']);

    $respondedRequest = new RespondedRequest($request, $response);

    $respondedRequest->addRedirectUri('https://www.example.com/bar');

    $canonicalUrl = (new Document($respondedRequest))->canonicalUrl();

    expect($canonicalUrl)->toBe('https://www.example.com/bar');
});

it('returns the URL referenced in canonical link as canonicalUrl()', function () {
    // The href of the canonical link is a relative reference (/quz); the expectation is the absolute URL.
    $html = '<!doctype html><html><head><title>foo</title><link rel="canonical" href="/quz" /></head>' .
        '<body>hello</body></html>';

    $request = new Request('GET', 'https://www.example.com/foo');

    $response = new Response(301, ['Location' => 'https://www.example.com/bar'], $html);

    $respondedRequest = new RespondedRequest($request, $response);

    $respondedRequest->addRedirectUri('https://www.example.com/bar');

    $canonicalUrl = (new Document($respondedRequest))->canonicalUrl();

    expect($canonicalUrl)->toBe('https://www.example.com/quz');
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/AbstractPaginatorTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules\PaginatorStopRules;
use Crwlr\Url\Url;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use tests\_Stubs\AbstractTestPaginator;
use tests\_Stubs\DummyLogger;

use function tests\helper_getRespondedRequest;

it('registers loaded requests from PSR-7 RequestInterface instances', function () {
    $paginator = new AbstractTestPaginator(nextUrl: 'https://www.example.com/bar');

    $first = helper_getRespondedRequest('GET', 'https://www.example.com/foo', [], 'Hi');

    $paginator->processLoaded($first->request, $first);

    // After the first call, one key is registered and the first request is the latest one.
    expect($paginator->getLoaded())->toBe(['f2be1fcc5667a8f4ee2fd7f48c69c909' => true]);

    expect($paginator->getLoadedCount())->toBe(1);

    expect($paginator->getLatestRequest())->toBe($first->request);

    $second = helper_getRespondedRequest('GET', 'https://www.example.com/bar', [], 'Yo');

    $paginator->processLoaded($second->request, $second);

    // The second call adds another key and replaces the latest request.
    expect($paginator->getLoaded())->toBe([
        'f2be1fcc5667a8f4ee2fd7f48c69c909' => true,
        'd9e0c3987944f190782f5af9506eb478' => true,
    ]);

    expect($paginator->getLoadedCount())->toBe(2);

    expect($paginator->getLatestRequest())->toBe($second->request);
});

it('registers loaded requests from RespondedRequest objects', function () {
    $paginator = new AbstractTestPaginator(nextUrl: 'https://www.example.com/bar');

    $passedRequest = new Request('GET', Url::parsePsr7('https://www.example.com/foo'), [], 'Hi');

    $respondedToRequest = new Request('GET', Url::parsePsr7('https://www.example.com/bar'), [], 'Yo');

    // The request inside the RespondedRequest differs from the one passed as first argument. The latest
    // request is expected to be the one from the RespondedRequest.
    $paginator->processLoaded($passedRequest, new RespondedRequest($respondedToRequest, new Response()));

    expect($paginator->getLoaded())->toBe(['d9e0c3987944f190782f5af9506eb478' => true]);

    expect($paginator->getLoadedCount())->toBe(1);

    expect($paginator->getLatestRequest())->toBe($respondedToRequest);
});

it('knows when the max pages to load limit is reached', function () {
    $paginator = new AbstractTestPaginator(3);

    // With a limit of 3, only processing the third distinct URL is expected to reach the limit.
    $limitReachedAfterUrl = [
        'https://www.example.com/foo' => false,
        'https://www.example.com/bar' => false,
        'https://www.example.com/baz' => true,
    ];

    foreach ($limitReachedAfterUrl as $url => $expectedLimitReached) {
        $respondedRequest = helper_getRespondedRequest(url: $url);

        $paginator->processLoaded($respondedRequest->request, $respondedRequest);

        expect($paginator->limitReached())->toBe($expectedLimitReached);
    }

    expect($paginator->hasFinished())->toBeTrue();
});

test('the same request is not registered twice', function () {
    $paginator = new AbstractTestPaginator();

    // The helper is called twice without arguments; the loaded count has to stay at 1 after both calls.
    foreach ([1, 2] as $ignored) {
        $respondedRequest = helper_getRespondedRequest();

        $paginator->processLoaded($respondedRequest->request, $respondedRequest);

        expect($paginator->getLoadedCount())->toBe(1);
    }
});

it('logs a message when the max pages limit was reached', function () {
    $logger = new DummyLogger();

    $paginator = new AbstractTestPaginator(2);

    $firstPage = helper_getRespondedRequest(url: 'https://www.example.com/foo');

    $paginator->processLoaded($firstPage->request, $firstPage);

    // Only one of the two allowed pages is loaded, so the regular finished message is expected.
    $paginator->logWhenFinished($logger);

    expect($logger->messages[0])->toBe(['level' => 'info', 'message' => 'Finished paginating.']);

    $secondPage = helper_getRespondedRequest(url: 'https://www.example.com/bar');

    $paginator->processLoaded($secondPage->request, $secondPage);

    // With the second page the limit of 2 is hit, so the limit notice is expected instead.
    $paginator->logWhenFinished($logger);

    expect($logger->messages[1])->toBe(['level' => 'notice', 'message' => 'Max pages limit reached.']);
});

it('logs a message when it finished paginating', function () {
    $logger = new DummyLogger();

    $paginator = new AbstractTestPaginator();

    $paginator->stopWhen(PaginatorStopRules::isEmptyResponse());

    $respondedRequest = helper_getRespondedRequest();

    $paginator->processLoaded($respondedRequest->request, $respondedRequest);

    $paginator->logWhenFinished($logger);

    $firstMessage = $logger->messages[0];

    expect($firstMessage)->toBe(['level' => 'info', 'message' => 'Finished paginating.']);
});

it('stops paginating when a stop condition is met', function () {
    // Creates a fresh paginator with both stop rules registered.
    $createPaginator = function (): AbstractTestPaginator {
        $paginator = new AbstractTestPaginator();

        $paginator
            ->stopWhen(PaginatorStopRules::isEmptyResponse())
            ->stopWhen(PaginatorStopRules::isEmptyInJson('items'));

        return $paginator;
    };

    // Scenario 1: a response with items keeps it going, a response without an "items" key stops it.
    $paginator = $createPaginator();

    $pageWithItems = helper_getRespondedRequest(
        url: 'https://www.example.com/list?page=1',
        responseBody: '{ "items": ["foo"] }',
    );

    $paginator->processLoaded($pageWithItems->request, $pageWithItems);

    expect($paginator->hasFinished())->toBeFalse();

    $pageWithoutItemsKey = helper_getRespondedRequest(
        url: 'https://www.example.com/list?page=2',
        responseBody: '{}',
    );

    $paginator->processLoaded($pageWithoutItemsKey->request, $pageWithoutItemsKey);

    expect($paginator->hasFinished())->toBeTrue();

    // Scenario 2: an empty "items" array stops it right away.
    $paginator = $createPaginator();

    $pageWithEmptyItems = helper_getRespondedRequest(
        url: 'https://www.example.com/list?page=1',
        responseBody: '{ "items": [] }',
    );

    $paginator->processLoaded($pageWithEmptyItems->request, $pageWithEmptyItems);

    expect($paginator->hasFinished())->toBeTrue();
});

test('after calling the setFinished() method, the hasFinished() method returns true', function () {
    $paginator = new AbstractTestPaginator();

    // A fresh paginator is not finished yet.
    $finishedBefore = $paginator->hasFinished();

    expect($finishedBefore)->toBeFalse();

    $paginator->setFinished();

    $finishedAfter = $paginator->hasFinished();

    expect($finishedAfter)->toBeTrue();
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/QueryParams/AbstractQueryParamManipulatorTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators\QueryParams;

use Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParams\AbstractQueryParamManipulator;
use Crwlr\QueryString\Query;

it('gets the current value of a query param', function () {
    // Anonymous subclass that records what getCurrentValue() yields in a public property for inspection.
    $probe = new class ('foo') extends AbstractQueryParamManipulator {
        public string $captured = '';

        public function execute(Query $query): Query
        {
            $this->captured = $this->getCurrentValue($query);

            return $query;
        }
    };

    $query = Query::fromString('foo=bar');

    $probe->execute($query);

    expect($probe->captured)->toBe('bar');
});

it('gets the current value of a query param as integer', function () {
    // Anonymous subclass that records what getCurrentValueAsInt() yields in a public int property.
    $probe = new class ('foo') extends AbstractQueryParamManipulator {
        public int $captured = 0;

        public function execute(Query $query): Query
        {
            $this->captured = $this->getCurrentValueAsInt($query);

            return $query;
        }
    };

    $query = Query::fromString('foo=123');

    $probe->execute($query);

    // Strict comparison: the value has to be the integer 123, not the string '123'.
    expect($probe->captured)->toBe(123);
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/QueryParams/DecrementorTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators\QueryParams;

use Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParams\Decrementor;
use Crwlr\QueryString\Query;

it('reduces a query param value by a certain number', function () {
    $subtractTen = new Decrementor('foo', 10);

    $query = Query::fromString('foo=20');

    expect($query->get('foo'))->toBe('20');

    // Every execute() call works on the same Query instance; the last step goes below zero.
    foreach (['10', '0', '-10'] as $expectedValue) {
        $subtractTen->execute($query);

        expect($query->get('foo'))->toBe($expectedValue);
    }
});

it('reduces a non first level query param value by a certain number', function () {
    $query = Query::fromString('foo[bar][baz]=10');

    // The third constructor argument is set to true together with the dot notation key 'foo.bar.baz'.
    $result = (new Decrementor('foo.bar.baz', 7, true))->execute($query);

    expect($result->toString())->toBe('foo%5Bbar%5D%5Bbaz%5D=3');
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/QueryParams/IncrementorTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators\QueryParams;

use Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParams\Incrementor;
use Crwlr\QueryString\Query;

it('increments a query param value by a certain number', function () {
    $addTen = new Incrementor('foo', 10);

    $query = Query::fromString('foo=-10');

    expect($query->get('foo'))->toBe('-10');

    // Every execute() call works on the same Query instance, starting from a negative value.
    foreach (['0', '10', '20'] as $expectedValue) {
        $addTen->execute($query);

        expect($query->get('foo'))->toBe($expectedValue);
    }
});

it('increments a non first level query param value by a certain number', function () {
    $query = Query::fromString('foo[bar][baz]=3');

    // The third constructor argument is set to true together with the dot notation key 'foo.bar.baz'.
    $result = (new Incrementor('foo.bar.baz', 7, true))->execute($query);

    expect($result->toString())->toBe('foo%5Bbar%5D%5Bbaz%5D=10');
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/QueryParamsPaginatorTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParamsPaginator;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

it('increases and decreases values in request url query params', function () {
    $initialRequest = new Request('GET', 'https://www.example.com/list?page=1&offset=20&foo=40&bar=10');

    // increase('page') is called without an explicit amount; the expected URL shows page going from 1 to 2.
    $paginator = QueryParamsPaginator::paramsInUrl()
        ->increase('page')
        ->increase('offset', 20)
        ->decrease('foo', 10)
        ->decrease('bar', 20);

    $paginator->processLoaded($initialRequest, new RespondedRequest($initialRequest, new Response()));

    $nextUrl = $paginator->getNextRequest()?->getUri()->__toString();

    expect($nextUrl)->toBe('https://www.example.com/list?page=2&offset=40&foo=30&bar=-10');
});

it('increases and decreases values in query params in the body', function () {
    $initialRequest = new Request('POST', 'https://www.example.com/list', body: 'page=1&offset=20&foo=40&bar=10');

    $paginator = QueryParamsPaginator::paramsInBody()
        ->increase('page')
        ->increase('offset', 20)
        ->decrease('foo', 10)
        ->decrease('bar', 20);

    $paginator->processLoaded($initialRequest, new RespondedRequest($initialRequest, new Response()));

    $upcoming = $paginator->getNextRequest();

    // Method and URL are expected to stay as they were, only the body params change.
    expect($upcoming?->getMethod())->toBe('POST');

    expect($upcoming?->getUri()->__toString())->toBe('https://www.example.com/list');

    expect($upcoming?->getBody()->getContents())->toBe('page=2&offset=40&foo=30&bar=-10');
});

it('increases and decreases non first level (of query array) parameters using dot notation', function () {
    $initialRequest = new Request(
        'POST',
        'https://www.example.com/list',
        body: 'pagination[page]=1&pagination[size]=25&pagination2[page]=1&pagination2[size]=25&foo=bar',
    );

    // Mixes the dedicated *UsingDotNotation() methods with increase()/decrease() called with a third argument true.
    $paginator = QueryParamsPaginator::paramsInBody()
        ->increaseUsingDotNotation('pagination.page')
        ->increase('pagination.size', 5, true)
        ->decreaseUsingDotNotation('pagination2.page')
        ->decrease('pagination2.size', 5, true);

    $paginator->processLoaded($initialRequest, new RespondedRequest($initialRequest, new Response()));

    $nextBody = $paginator->getNextRequest()?->getBody()->getContents();

    // The untouched param foo=bar has to survive unchanged.
    expect($nextBody)->toBe(
        'pagination%5Bpage%5D=2&pagination%5Bsize%5D=30&pagination2%5Bpage%5D=0&pagination2%5Bsize%5D=20&foo=bar',
    );
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/SimpleWebsitePaginatorTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\SimpleWebsitePaginator;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules\PaginatorStopRules;
use GuzzleHttp\Psr7\Response;
use PHPUnit\Framework\TestCase;
use Psr\Http\Message\RequestInterface;

use function tests\helper_getRespondedRequest;

/**
 * Creates a RespondedRequest for a path on https://www.example.com with the given response body.
 *
 * @param string $urlPath path (and optionally query) that is appended to the example.com origin
 * @param string $body    the response body
 */
function helper_getRespondedRequestWithResponseBody(string $urlPath, string $body): RespondedRequest
{
    $absoluteUrl = 'https://www.example.com' . $urlPath;

    return helper_getRespondedRequest(url: $absoluteUrl, responseBody: $body);
}

/**
 * Renders a `<div class="pagination">` wrapper containing one anchor per given link.
 *
 * Each anchor is followed by a space and PHP_EOL.
 *
 * @param array<string, string> $links map of href => link text
 */
function helper_createResponseBodyWithPaginationLinks(array $links): string
{
    $anchors = '';

    foreach ($links as $href => $label) {
        $anchors .= sprintf('<a href="%s">%s</a> %s', $href, $label, PHP_EOL);
    }

    return '<div class="pagination">' . $anchors . '</div>';
}

/** @var TestCase $this */

it('says it has finished when no initial response was provided yet', function () {
    // No response has been processed at this point.
    $isFinished = (new SimpleWebsitePaginator('.pagination'))->hasFinished();

    expect($isFinished)->toBeTrue();
});

it('says it has finished when a response is provided, but it has no pagination links', function () {
    // The body contains nothing matching the '.pagination' selector.
    $loaded = helper_getRespondedRequestWithResponseBody('/listing', '<div class="listing"></div>');

    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    $paginator->processLoaded($loaded->request, $loaded);

    expect($paginator->hasFinished())->toBeTrue();
});

it('says it has not finished when an initial response with pagination links is provided', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    // Fixture fix: the last link was written as '/listing?page12' (missing '='), which is not a valid
    // page query like the other links.
    $responseBody = helper_createResponseBodyWithPaginationLinks([
        '/listing?page=1' => 'First page',
        '/listing?page=2' => 'Next page',
        '/listing?page=12' => 'Last page',
    ]);

    $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing', $responseBody);

    $paginator->processLoaded($respondedRequest->request, $respondedRequest);

    // There are found links that have not been loaded yet, so paginating has to go on.
    expect($paginator->hasFinished())->toBeFalse();
});

it('has finished when the loaded pages count exceeds the max pages limit', function () {
    // The paginator is limited to 3 pages.
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    // Fixture fix: the last link was written as '/listing?page12' (missing '=').
    $responseBody = helper_createResponseBodyWithPaginationLinks([
        '/listing?page=1' => 'First page',
        '/listing?page=2' => 'Next page',
        '/listing?page=12' => 'Last page',
    ]);

    // First processed response.
    $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing', $responseBody);

    $paginator->processLoaded($respondedRequest->request, $respondedRequest);

    expect($paginator->hasFinished())->toBeFalse();

    // Second processed response.
    $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody);

    $paginator->processLoaded($respondedRequest->request, $respondedRequest);

    expect($paginator->hasFinished())->toBeFalse();

    $responseBody = helper_createResponseBodyWithPaginationLinks([
        '/listing?page=1' => 'First page',
        '/listing?page=3' => 'Next page',
        '/listing?page=12' => 'Last page',
    ]);

    // Third processed response: the paginator has to report finished, although unloaded links remain.
    $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=2', $responseBody);

    $paginator->processLoaded($respondedRequest->request, $respondedRequest);

    expect($paginator->hasFinished())->toBeTrue();
});

it('says it has finished when there are no more found pagination links, that haven\'t been loaded yet', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    // Both pages link to page 2 only.
    $bodyLinkingToPageTwo = helper_createResponseBodyWithPaginationLinks(['/listing?page=2' => 'Page Two']);

    $pageOne = helper_getRespondedRequestWithResponseBody('/listing?page=1', $bodyLinkingToPageTwo);

    $paginator->processLoaded($pageOne->request, $pageOne);

    expect($paginator->hasFinished())->toBeFalse();

    // The returned request isn't needed here, the call is made for its effect on the paginator.
    $paginator->getNextRequest();

    $pageTwo = helper_getRespondedRequestWithResponseBody('/listing?page=2', $bodyLinkingToPageTwo);

    $paginator->processLoaded($pageTwo->request, $pageTwo);

    expect($paginator->hasFinished())->toBeTrue();
});

it('finds pagination links when the selector matches the link itself', function () {
    // The selector targets the anchor element directly.
    $loaded = helper_getRespondedRequestWithResponseBody(
        '/listing?page=1',
        '<a class="nextPageLink" href="/listing?page=2">Next Page</a>',
    );

    $paginator = new SimpleWebsitePaginator('.nextPageLink', 3);

    $paginator->processLoaded($loaded->request, $loaded);

    $nextUrl = $paginator->getNextRequest()?->getUri()->__toString();

    expect($nextUrl)->toBe('https://www.example.com/listing?page=2');
});

it('finds pagination links when the selected element is a wrapper for pagination links', function () {
    // The selector targets a wrapper element that contains the anchor.
    $loaded = helper_getRespondedRequestWithResponseBody(
        '/listing?page=1',
        '<div class="pagination"><a href="/listing?page=2">Next Page</a></div>',
    );

    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    $paginator->processLoaded($loaded->request, $loaded);

    $nextUrl = $paginator->getNextRequest()?->getUri()->__toString();

    expect($nextUrl)->toBe('https://www.example.com/listing?page=2');
});

it('finds all pagination links, when multiple elements match the pagination links selector', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    // Two separate wrapper elements match the selector, each holding one link.
    $html = <<<HTML
        <div class="pagination"><a href="/listing?page=2">Next Page</a></div>
        <div class="pagination"><a href="/listing?page=12">Last Page</a></div>
        HTML;

    $loaded = helper_getRespondedRequestWithResponseBody('/listing?page=1', $html);

    $paginator->processLoaded($loaded->request, $loaded);

    // Consecutive getNextRequest() calls have to yield the links in document order.
    expect($paginator->getNextRequest()?->getUri()->__toString())->toBe('https://www.example.com/listing?page=2');

    expect($paginator->getNextRequest()?->getUri()->__toString())->toBe('https://www.example.com/listing?page=12');
});

it('logs that max pages limit was reached when it was reached', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    // Four links are present, but the paginator is limited to 3 pages.
    $html = <<<HTML
        <div class="pagination">
            <a href="/listing?page=1">Page One</a>
            <a href="/listing?page=2">Page Two</a>
            <a href="/listing?page=3">Page Three</a>
            <a href="/listing?page=4">Page Four</a>
        </div>
        HTML;

    // Process three responses, all with the same body.
    foreach (['/listing?page=1', '/listing?page=2', '/listing?page=3'] as $path) {
        $loaded = helper_getRespondedRequestWithResponseBody($path, $html);

        $paginator->processLoaded($loaded->request, $loaded);
    }

    expect($paginator->hasFinished())->toBeTrue();

    $paginator->logWhenFinished(new CliLogger());

    $printed = $this->getActualOutputForAssertion();

    expect($printed)->toContain('Max pages limit reached');
});

// Processes three pages that each list the same three pagination links, and checks the log output
// produced via CliLogger once the paginator reports being finished.
it('logs that all found pagination links have been loaded when max pages limit was not reached', function () {
    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    $responseBody = <<<HTML
        <div class="pagination">
            <a href="/listing?page=1">Page One</a>
            <a href="/listing?page=2">Page Two</a>
            <a href="/listing?page=3">Page Three</a>
        </div>
        HTML;

    $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=1', $responseBody);

    $paginator->processLoaded($respondedRequest->request, $respondedRequest);

    // The returned request is discarded; the call is only made for its effect on the paginator.
    $paginator->getNextRequest();

    $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=2', $responseBody);

    // NOTE(review): logWhenFinished() is called here and again after the next processLoaded(), before
    // hasFinished() is asserted to be true. Whether these early calls are intentional can't be told from
    // this file - confirm, because any output they print also ends up in $output below.
    $paginator->logWhenFinished(new CliLogger());

    $paginator->processLoaded($respondedRequest->request, $respondedRequest);

    $paginator->logWhenFinished(new CliLogger());

    $paginator->getNextRequest();

    $respondedRequest = helper_getRespondedRequestWithResponseBody('/listing?page=3', $responseBody);

    $paginator->processLoaded($respondedRequest->request, $respondedRequest);

    expect($paginator->hasFinished())->toBeTrue();

    $paginator->logWhenFinished(new CliLogger());

    // Everything printed so far during this test.
    $output = $this->getActualOutputForAssertion();

    expect($output)
        ->not()->toContain('Max pages limit reached')
        ->and($output)
        ->toContain('All found pagination links loaded');
});

it(
    'always creates upcoming requests from the parent request, where a link was found (which does not have to be ' .
    'the latest processed response)',
    function () {
        $paginator = new SimpleWebsitePaginator('.pagination', 3);

        // First page: requested with header foo=bar, links to pages 1-3.
        $firstBody = <<<HTML
        <div class="pagination">
            <a href="/list?page=1">Page One</a>
            <a href="/list?page=2">Page Two</a>
            <a href="/list?page=3">Page Three</a>
        </div>
        HTML;

        $firstLoaded = helper_getRespondedRequest(
            'GET',
            'https://www.example.com/list?page=1',
            ['foo' => 'bar'],
            responseBody: $firstBody,
        );

        $paginator->processLoaded($firstLoaded->request, $firstLoaded);

        // Second page: requested with header foo=baz, links to pages 4-6.
        $secondBody = <<<HTML
            <div class="pagination">
                <a href="/list?page=4">Page One</a>
                <a href="/list?page=5">Page Two</a>
                <a href="/list?page=6">Page Three</a>
            </div>
            HTML;

        $secondLoaded = helper_getRespondedRequest(
            'GET',
            'https://www.example.com/list?page=2',
            ['foo' => 'baz'],
            responseBody: $secondBody,
        );

        $paginator->processLoaded($secondLoaded->request, $secondLoaded);

        // The next request has to carry the header of the first request, not the one of the latest.
        $headerValues = $paginator->getNextRequest()?->getHeader('foo');

        expect($headerValues)->toBe(['bar']);
    },
);

it('cleans up the stored parent requests always when getting the next request to load', function () {
    // Anonymous subclass that makes the parentRequests property readable from the test.
    $paginator = new class ('.pagination') extends SimpleWebsitePaginator {
        /**
         * @return array<string, RequestInterface>
         */
        public function parentRequests(): array
        {
            return $this->parentRequests;
        }
    };

    $html = <<<HTML
        <div class="pagination">
            <a href="/list?page=2">Page Two</a>
            <a href="/list?page=3">Page Three</a>
        </div>
        HTML;

    $initial = helper_getRespondedRequest(
        'GET',
        'https://www.example.com/list?page=1',
        ['foo' => 'bar'],
        responseBody: $html,
    );

    $paginator->processLoaded($initial->request, $initial);

    expect(count($paginator->parentRequests()))->toBe(1);

    // Load the two found links one after another: one parent request is still stored after the first
    // round, none after the second.
    foreach ([1, 0] as $expectedParentCount) {
        $upcoming = $paginator->getNextRequest();

        if (!$upcoming) {
            $this->fail('failed to get next request');
        }

        $loaded = new RespondedRequest($upcoming, new Response());

        $paginator->processLoaded($loaded->request, $loaded);

        expect(count($paginator->parentRequests()))->toBe($expectedParentCount);
    }
});

it('does not stop, when a response does not meet the stop rule criterion', function () {
    // The generated body does not contain the text 'hello world'.
    $html = helper_createResponseBodyWithPaginationLinks(['/listing?page=2' => 'Next page']);

    $loaded = helper_getRespondedRequestWithResponseBody('/listing', $html);

    $paginator = new SimpleWebsitePaginator('.pagination', 3);

    $paginator->stopWhen(PaginatorStopRules::contains('hello world'));

    $paginator->processLoaded($loaded->request, $loaded);

    expect($paginator->hasFinished())->toBeFalse();
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/ContainsTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules\PaginatorStopRules;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

it('stops when called without a RespondedRequest object', function () {
    $request = new Request('GET', 'https://www.example.com/foo');

    // null is passed instead of a RespondedRequest.
    $decision = PaginatorStopRules::contains('foo')->shouldStop($request, null);

    expect($decision)->toBeTrue();
});

it('stops when the string is contained in the response body', function () {
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: 'This string contains foo'),
    );

    $decision = PaginatorStopRules::contains('foo')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('does not stop when the string is not contained in the response body', function () {
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: 'This does not contain the string'),
    );

    $decision = PaginatorStopRules::contains('foo')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeFalse();
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyInHtmlTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules\PaginatorStopRules;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

it('should stop, when called without a RespondedRequest object', function () {
    $request = new Request('GET', 'https://www.crwl.io/');

    // null is passed instead of a RespondedRequest.
    $decision = PaginatorStopRules::isEmptyInHtml('#list .item')->shouldStop($request, null);

    expect($decision)->toBeTrue();
});

it('should stop, when response is not HTML', function () {
    // The body is a JSON document.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '{ "foo": "bar" }'),
    );

    $decision = PaginatorStopRules::isEmptyInHtml('#list .item')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should stop, when the selector target does not exist in the HTML response', function () {
    // The document only has an element with id "foo", nothing matches '#list'.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '<div id="foo"></div>'),
    );

    $decision = PaginatorStopRules::isEmptyInHtml('#list')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should stop, when the selector target is empty in the response', function () {
    // The matched element contains nothing but whitespace.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '<div id="list">  </div>'),
    );

    $decision = PaginatorStopRules::isEmptyInHtml('#list')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should not stop, when the selector target is not empty in the response', function () {
    $stopRule = PaginatorStopRules::isEmptyInHtml('#list');

    // First body: text content. Second body: the content is only a child element.
    $nonEmptyBodies = [
        '<div id="list">a</div>',
        '<div id="list"><span class="child"></span></div>',
    ];

    foreach ($nonEmptyBodies as $html) {
        $loaded = new RespondedRequest(
            new Request('GET', 'https://www.crwl.io/'),
            new Response(body: $html),
        );

        expect($stopRule->shouldStop($loaded->request, $loaded))->toBeFalse();
    }
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyInJsonTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules\PaginatorStopRules;
use Crwlr\Utils\Exceptions\InvalidJsonException;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

it('should stop, when called without a RespondedRequest object', function () {
    $request = new Request('GET', 'https://www.crwl.io/');

    // null is passed instead of a RespondedRequest.
    $decision = PaginatorStopRules::isEmptyInJson('data.items')->shouldStop($request, null);

    expect($decision)->toBeTrue();
});

it('throws an exception when response is not valid JSON', function () {
    $rule = PaginatorStopRules::isEmptyInJson('data.items');

    // The body is an HTML document, so it can't be parsed as JSON.
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '<!doctype html><html></html>'),
    );

    // The call itself has to throw (see throws() below), so there is no return value to assert on.
    // Wrapping it in expect(...)->toBeTrue() would wrongly suggest that the rule returns true here.
    $rule->shouldStop($respondedRequest->request, $respondedRequest);
})->throws(InvalidJsonException::class);

it('should stop, when the dot notation key does not exist in the response', function () {
    // "data" exists, but has no "items" key.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '{ "data": { "foo": "bar" } }'),
    );

    $decision = PaginatorStopRules::isEmptyInJson('data.items')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should stop, when the dot notation key is empty in the response', function () {
    // "data.items" exists, but is an empty array.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '{ "data": { "items": [] } }'),
    );

    $decision = PaginatorStopRules::isEmptyInJson('data.items')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should not stop, when the dot notation key is not empty in the response', function () {
    // "data.items" holds two elements.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '{ "data": { "items": ["foo", "bar"] } }'),
    );

    $decision = PaginatorStopRules::isEmptyInJson('data.items')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeFalse();
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyInXmlTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules\PaginatorStopRules;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

it('should stop, when called without a RespondedRequest object', function () {
    $request = new Request('GET', 'https://www.crwl.io/');

    // null is passed instead of a RespondedRequest.
    $decision = PaginatorStopRules::isEmptyInXml('channel item')->shouldStop($request, null);

    expect($decision)->toBeTrue();
});

it('should stop, when response is not XML', function () {
    // The body is an empty JSON object.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: '{}'),
    );

    $decision = PaginatorStopRules::isEmptyInXml('channel item')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should stop, when the selector target does not exist in the XML response', function () {
    // The channel element has no item children.
    $xml = '<?xml version="1.0" encoding="UTF-8" ?><rss version="2.0"><channel></channel></rss>';

    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: $xml),
    );

    $decision = PaginatorStopRules::isEmptyInXml('channel item')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should stop, when the selector target is empty in the response', function () {
    // The item element contains nothing but whitespace.
    $xml = '<?xml version="1.0" encoding="UTF-8" ?><rss version="2.0"><channel><item>  </item></channel></rss>';

    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: $xml),
    );

    $decision = PaginatorStopRules::isEmptyInXml('channel item')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should not stop, when the selector target is not empty in the response', function () {
    $stopRule = PaginatorStopRules::isEmptyInXml('channel item');

    // First document: text content. Second document: the content is only a child element.
    $nonEmptyDocuments = [
        '<?xml version="1.0" encoding="UTF-8" ?><rss version="2.0"><channel><item>a</item></channel></rss>',
        '<?xml version="1.0" encoding="UTF-8" ?><rss version="2.0"><channel><item><foo></foo></item></channel></rss>',
    ];

    foreach ($nonEmptyDocuments as $xml) {
        $loaded = new RespondedRequest(
            new Request('GET', 'https://www.crwl.io/'),
            new Response(body: $xml),
        );

        expect($stopRule->shouldStop($loaded->request, $loaded))->toBeFalse();
    }
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/IsEmptyResponseTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules\PaginatorStopRules;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

it('should stop, when no RespondedRequest object is provided', function () {
    $request = new Request('GET', 'https://www.crwl.io/');

    // null is passed instead of a RespondedRequest.
    $decision = PaginatorStopRules::isEmptyResponse()->shouldStop($request, null);

    expect($decision)->toBeTrue();
});

it('should stop, when the response body is empty', function () {
    // Zero-length body.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: ''),
    );

    $decision = PaginatorStopRules::isEmptyResponse()->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should stop, when the response body is only spaces', function () {
    // Body made of a space, line feed, carriage return, tab and another space.
    $whitespaceOnly = " \n\r\t ";

    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.example.com/'),
        new Response(body: $whitespaceOnly),
    );

    $decision = PaginatorStopRules::isEmptyResponse()->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should stop, when the response body is an empty JSON array', function () {
    // The empty array is surrounded by a space on each side.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwlr.software/packages'),
        new Response(body: ' [] '),
    );

    $decision = PaginatorStopRules::isEmptyResponse()->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('should stop, when the response body is an empty JSON object', function () {
    // Body is exactly an empty JSON object.
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/en/home'),
        new Response(body: '{}'),
    );

    $decision = PaginatorStopRules::isEmptyResponse()->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});


================================================
FILE: tests/Steps/Loading/Http/Paginators/StopRules/NotContainsTest.php
================================================
<?php

namespace tests\Steps\Loading\Http\Paginators\StopRules;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules\PaginatorStopRules;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

it('stops when called without a RespondedRequest object', function () {
    $request = new Request('GET', 'https://www.example.com/foo');

    // null is passed instead of a RespondedRequest.
    $decision = PaginatorStopRules::notContains('foo')->shouldStop($request, null);

    expect($decision)->toBeTrue();
});

it('stops when the string is not contained in the response body', function () {
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: 'This does not contain the string'),
    );

    $decision = PaginatorStopRules::notContains('foo')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeTrue();
});

it('does not stop when the string is contained in the response body', function () {
    $loaded = new RespondedRequest(
        new Request('GET', 'https://www.crwl.io/'),
        new Response(body: 'This contains the string foo'),
    );

    $decision = PaginatorStopRules::notContains('foo')->shouldStop($loaded->request, $loaded);

    expect($decision)->toBeFalse();
});


================================================
FILE: tests/Steps/Loading/HttpTest.php
================================================
<?php

namespace tests\Steps\Loading;

use Closure;
use Crwlr\Crawler\Input;
use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\Http\HeadlessBrowserLoaderHelper;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Loading\Http\Browser\BrowserAction;
use Crwlr\Url\Url;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use GuzzleHttp\Psr7\Utils;
use InvalidArgumentException;
use Mockery;
use Psr\Http\Message\RequestInterface;
use stdClass;
use tests\_Stubs\DummyLogger;
use Throwable;

use function tests\helper_getRespondedRequest;
use function tests\helper_invokeStepWithInput;
use function tests\helper_nonBotUserAgent;
use function tests\helper_traverseIterable;

it('can be invoked with a string as input', function () {
    $httpLoader = Mockery::mock(HttpLoader::class);

    // The loader's load() method has to be called exactly once.
    $httpLoader->shouldReceive('load')->once();

    $outputs = (new Http('GET'))
        ->setLoader($httpLoader)
        ->invokeStep(new Input('https://www.foo.bar/baz'));

    helper_traverseIterable($outputs);
});

it('can be invoked with a PSR-7 Uri object as input', function () {
    $httpLoader = Mockery::mock(HttpLoader::class);

    // The loader's load() method has to be called exactly once.
    $httpLoader->shouldReceive('load')->once();

    $uri = Url::parsePsr7('https://www.linkedin.com/');

    $outputs = (new Http('GET'))
        ->setLoader($httpLoader)
        ->invokeStep(new Input($uri));

    helper_traverseIterable($outputs);
});

it('logs an error message when invoked with something else as input', function () {
    $logger = new DummyLogger();

    $step = (new Http('GET'))
        ->setLoader(Mockery::mock(HttpLoader::class))
        ->addLogger($logger);

    // A plain object is used as an input value the step cannot work with.
    $outputs = $step->invokeStep(new Input(new stdClass()));

    helper_traverseIterable($outputs);

    expect($logger->messages)->not->toBeEmpty();

    expect($logger->messages[0]['message'])
        ->toStartWith(
            'The Crwlr\Crawler\Steps\Loading\Http step was called with input that it can not work with:',
        )
        ->toEndWith('. The invalid input is of type object.');
});

it('logs an error message when invoked with a relative reference URI', function () {
    $logger = new DummyLogger();

    // A real loader instance is used here; it shares the logger with the step.
    $loader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger);

    $step = (new Http('GET'))
        ->setLoader($loader)
        ->addLogger($logger);

    helper_invokeStepWithInput($step, '/foo/bar');

    expect($logger->messages)->not->toBeEmpty();

    expect($logger->messages[0]['message'])->toBe(
        'Invalid input URL: /foo/bar - The URI is a relative reference and therefore can\'t be loaded.',
    );
});

it('catches the exception and logs an error when fed with an invalid URL', function () {
    $logger = new DummyLogger();

    // The loader mock has no expectations: loading must never be attempted for an invalid URL.
    $loader = Mockery::mock(HttpLoader::class);

    $step = (new Http('GET'))->setLoader($loader)->addLogger($logger);

    helper_traverseIterable($step->invokeStep(new Input('https://')));

    // Exactly one error-level message describing the invalid URL is expected.
    expect($logger->messages)->toHaveCount(1)
        ->and($logger->messages[0]['level'])->toBe('error')
        ->and($logger->messages[0]['message'])->toBe(
            'The Crwlr\\Crawler\\Steps\\Loading\\Http step was called with input that it can not work with: https:// ' .
            'is not a valid URL.',
        );
});

it('throws an exception when invoked with a relative reference URI and stopOnErrorResponse() was called', function () {
    $logger = new DummyLogger();

    $loader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger);

    // With stopOnErrorResponse() enabled, the relative reference must end in an exception.
    $step = (new Http('GET'))
        ->setLoader($loader)
        ->addLogger($logger)
        ->stopOnErrorResponse();

    helper_invokeStepWithInput($step, '/foo/bar');
})->throws(InvalidArgumentException::class);

test('You can set the request method via constructor', function (string $httpMethod) {
    $loader = Mockery::mock(HttpLoader::class);

    // The request handed to the loader has to carry the method passed to the constructor.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getMethod() === $httpMethod)
        ->once();

    // usesHeadlessBrowser() is only stubbed for non-GET methods.
    if ($httpMethod !== 'GET') {
        $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
    }

    $outputs = (new Http($httpMethod))
        ->setLoader($loader)
        ->invokeStep(new Input('https://www.foo.bar/baz'));

    helper_traverseIterable($outputs);
})->with(['GET', 'POST', 'PUT', 'PATCH', 'DELETE']);

test('You can set request headers via constructor', function () {
    $expectedHeaders = [
        'Accept' => [
            'text/html',
            'application/xhtml+xml',
            'application/xml;q=0.9',
            'image/avif',
            'image/webp',
            'image/apng',
            '*/*;q=0.8',
            'application/signed-exchange;v=b3;q=0.9',
        ],
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
        'Accept-Language' => ['de-DE', 'de;q=0.9', 'en-US;q=0.8', 'en;q=0.7'],
    ];

    $loader = Mockery::mock(HttpLoader::class);

    $loader
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) use ($expectedHeaders) {
            // Every configured header has to arrive at the loader with exactly the same list of values.
            foreach ($expectedHeaders as $name => $expectedValues) {
                $actualValues = $request->getHeader($name);

                if (!$actualValues || $actualValues !== $expectedValues) {
                    return false;
                }
            }

            return true;
        })
        ->once();

    $outputs = (new Http('GET', $expectedHeaders))
        ->setLoader($loader)
        ->invokeStep(new Input('https://www.crwlr.software/packages/url'));

    helper_traverseIterable($outputs);
});

test('You can set request body via constructor', function () {
    $expectedBody = 'This is the request body';

    $loader = Mockery::mock(HttpLoader::class);

    // The body of the request passed to the loader has to equal the constructor argument.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getBody()->getContents() === $expectedBody)
        ->once();

    $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();

    $outputs = (new Http('PATCH', [], $expectedBody))
        ->setLoader($loader)
        ->invokeStep(new Input('https://github.com/'));

    helper_traverseIterable($outputs);
});

test('You can set the http version for the request via constructor', function (string $httpVersion) {
    $loader = Mockery::mock(HttpLoader::class);

    // The protocol version of the request passed to the loader has to match the constructor argument.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getProtocolVersion() === $httpVersion)
        ->once();

    $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();

    $outputs = (new Http('PATCH', [], 'body', $httpVersion))
        ->setLoader($loader)
        ->invokeStep(new Input('https://packagist.org/packages/crwlr/url'));

    helper_traverseIterable($outputs);
})->with(['1.0', '1.1', '2.0']);

it('has static methods to create instances with all the different http methods', function (string $httpMethod) {
    $loader = Mockery::mock(HttpLoader::class);

    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getMethod() === $httpMethod)
        ->once();

    // usesHeadlessBrowser() is only stubbed for non-GET methods.
    if ($httpMethod !== 'GET') {
        $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
    }

    // The static factory methods are named like the lower-cased HTTP method (Http::get(), Http::post(), ...).
    $factoryMethod = strtolower($httpMethod);

    $outputs = Http::$factoryMethod()
        ->setLoader($loader)
        ->invokeStep(new Input('https://dev.to/otsch'));

    helper_traverseIterable($outputs);
})->with(['GET', 'POST', 'PUT', 'PATCH', 'DELETE']);

it(
    'calls the loadOrFail() loader method when the stopOnErrorResponse() method was called',
    function (string $httpMethod) {
        $respondedRequest = new RespondedRequest(new Request('GET', '/foo'), new Response(200));

        $loader = Mockery::mock(HttpLoader::class);

        // Only loadOrFail() is stubbed, so a call to load() would make the mock fail.
        $loader
            ->shouldReceive('loadOrFail')
            ->withArgs(fn (RequestInterface $request) => $request->getMethod() === $httpMethod)
            ->once()
            ->andReturn($respondedRequest);

        if ($httpMethod !== 'GET') {
            $loader->shouldReceive('usesHeadlessBrowser')->andReturnFalse();
        }

        $factoryMethod = strtolower($httpMethod);

        $step = Http::$factoryMethod()
            ->setLoader($loader)
            ->stopOnErrorResponse();

        helper_traverseIterable($step->invokeStep(new Input('https://example.com/otsch')));
    },
)->with(['GET', 'POST', 'PUT', 'PATCH', 'DELETE']);

test('you can keep response properties with their aliases', function () {
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/testresponse'),
        new Response(202, ['foo' => 'bar'], Utils::streamFor('testbody')),
    );

    $loader = Mockery::mock(HttpLoader::class);

    $loader->shouldReceive('load')->once()->andReturn($respondedRequest);

    $step = Http::get()
        ->setLoader($loader)
        ->keep(['url', 'status', 'headers', 'body']);

    $outputs = helper_invokeStepWithInput($step);

    // Each alias passed to keep() has to map to the matching part of the response.
    $expectedKeptData = [
        'url' => 'https://www.example.com/testresponse',
        'status' => 202,
        'headers' => ['foo' => ['bar']],
        'body' => 'testbody',
    ];

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->keep)->toBe($expectedKeptData);
});

test(
    'the value behind url and uri is the effectiveUri',
    function (string $outputKey) {
        $redirectedResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/testresponse'),
            new Response(202, ['foo' => 'bar'], Utils::streamFor('testbody')),
        );

        // Registering a redirect URI makes it differ from the originally requested URL.
        $redirectedResponse->addRedirectUri('https://www.example.com/testresponseredirect');

        $loader = Mockery::mock(HttpLoader::class);

        $loader->shouldReceive('load')->once()->andReturn($redirectedResponse);

        $step = Http::get()
            ->setLoader($loader)
            ->keep([$outputKey]);

        $outputs = helper_invokeStepWithInput($step);

        expect($outputs)->toHaveCount(1);

        expect($outputs[0]->keep)->toBe([$outputKey => 'https://www.example.com/testresponseredirect']);
    },
)->with(['url', 'uri']);

it('gets the URL for the request from an input array when useInputKeyAsUrl() was called', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
    ];

    $respondedRequest = new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200));

    $loader = Mockery::mock(HttpLoader::class);

    // The request URI has to be the value found under the configured input key.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getUri()->__toString() === $inputArray['someUrl'])
        ->once()
        ->andReturn($respondedRequest);

    $step = Http::get()
        ->setLoader($loader)
        ->useInputKeyAsUrl('someUrl');

    helper_invokeStepWithInput($step, $inputArray);
});

it(
    'automatically gets the URL for the request from an input array when it contains an url or uri key',
    function ($key) {
        $inputArray = [
            'foo' => 'bar',
            $key => 'https://www.example.com/baz',
        ];

        $respondedRequest = new RespondedRequest(
            new Request('GET', 'https://www.example.com/baz'),
            new Response(200),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // No useInputKeyAsUrl() call here: the key from the dataset has to be picked up on its own.
        $loader
            ->shouldReceive('load')
            ->withArgs(fn (RequestInterface $request) => $request->getUri()->__toString() === $inputArray[$key])
            ->once()
            ->andReturn($respondedRequest);

        $step = Http::get()->setLoader($loader);

        helper_invokeStepWithInput($step, $inputArray);
    },
)->with(['url', 'uri']);

it('gets the body for the request from an input array when useInputKeyAsBody() was called', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'someBodyThatIUsedToKnow' => 'foo=bar&baz=quz',
    ];

    $respondedRequest = new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200));

    $loader = Mockery::mock(HttpLoader::class);

    // The request body has to be the value found under the configured input key.
    $loader
        ->shouldReceive('load')
        ->withArgs(
            fn (RequestInterface $request) =>
                $request->getBody()->getContents() === $inputArray['someBodyThatIUsedToKnow'],
        )
        ->once()
        ->andReturn($respondedRequest);

    $step = Http::get()
        ->setLoader($loader)
        ->useInputKeyAsUrl('someUrl')
        ->useInputKeyAsBody('someBodyThatIUsedToKnow');

    helper_invokeStepWithInput($step, $inputArray);
});

it('gets a single header for the request from an input array when useInputKeyAsHeader() was called', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'someHeader' => 'someHeaderValue',
    ];

    $loader = Mockery::mock(HttpLoader::class);

    // The value under the 'someHeader' input key has to be sent as the only value of the 'header-name-x' header.
    $loader
        ->shouldReceive('load')
        ->withArgs(function (RequestInterface $request) use ($inputArray) {
            return $request->getHeader('header-name-x') === [$inputArray['someHeader']];
        })
        ->once()
        ->andReturn(new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200)));

    $step = Http::get()
        ->setLoader($loader)
        ->useInputKeyAsUrl('someUrl')
        ->useInputKeyAsHeader('someHeader', 'header-name-x');

    helper_invokeStepWithInput($step, $inputArray);
});

it('uses the input key as header name if no header name defined as argument', function () {
    $inputArray = [
        'foo' => 'bar',
        'url' => 'https://www.example.com/baz',
        'header-name' => 'someHeaderValue',
    ];

    $respondedRequest = new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200));

    $loader = Mockery::mock(HttpLoader::class);

    // Without an explicit header name, the input key itself ('header-name') has to be the header name.
    $loader
        ->shouldReceive('load')
        ->withArgs(
            fn (RequestInterface $request) => $request->getHeader('header-name') === [$inputArray['header-name']],
        )
        ->once()
        ->andReturn($respondedRequest);

    $step = Http::get()
        ->setLoader($loader)
        ->useInputKeyAsHeader('header-name');

    helper_invokeStepWithInput($step, $inputArray);
});

it('merges header values if you provide a static header value and use an input value as header', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'someHeader' => 'someHeaderValue',
    ];

    // The static value 'foo' is expected first, followed by the value taken from the input.
    $expectedHeaderValues = ['foo', $inputArray['someHeader']];

    $respondedRequest = new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200));

    $loader = Mockery::mock(HttpLoader::class);

    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => $request->getHeader('header-name-x') === $expectedHeaderValues)
        ->once()
        ->andReturn($respondedRequest);

    $step = Http::get(['header-name-x' => 'foo'])
        ->setLoader($loader)
        ->useInputKeyAsUrl('someUrl')
        ->useInputKeyAsHeader('someHeader', 'header-name-x');

    helper_invokeStepWithInput($step, $inputArray);
});

test('you can use useInputKeyAsHeader() multiple times', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'someHeader' => 'someHeaderValue',
        'anotherHeader' => 'anotherHeaderValue',
    ];

    $respondedRequest = new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200));

    $loader = Mockery::mock(HttpLoader::class);

    // Both headers have to be present, each one carrying the value of its own input key.
    $loader
        ->shouldReceive('load')
        ->withArgs(
            fn (RequestInterface $request) =>
                $request->getHeader('header-name-x') === [$inputArray['someHeader']] &&
                $request->getHeader('header-name-y') === [$inputArray['anotherHeader']],
        )
        ->once()
        ->andReturn($respondedRequest);

    $step = Http::get()
        ->setLoader($loader)
        ->useInputKeyAsUrl('someUrl')
        ->useInputKeyAsHeader('someHeader', 'header-name-x')
        ->useInputKeyAsHeader('anotherHeader', 'header-name-y');

    helper_invokeStepWithInput($step, $inputArray);
});

it('gets multiple headers from an input array using useInputKeyAsHeaders()', function () {
    $inputArray = [
        'foo' => 'bar',
        'someUrl' => 'https://www.example.com/baz',
        'customHeaders' => [
            'header-name-x' => 'foo',
            'header-name-y' => ['bar', 'baz'],
        ],
    ];

    $customHeaders = $inputArray['customHeaders'];

    // The statically configured 'quz' value is expected in front of the values coming from the input.
    $expectedXHeader = [$customHeaders['header-name-x']];

    $expectedYHeader = array_merge(['quz'], $customHeaders['header-name-y']);

    $respondedRequest = new RespondedRequest(new Request('GET', 'https://www.example.com/baz'), new Response(200));

    $loader = Mockery::mock(HttpLoader::class);

    $loader
        ->shouldReceive('load')
        ->withArgs(
            fn (RequestInterface $request) =>
                $request->getHeader('header-name-x') === $expectedXHeader &&
                $request->getHeader('header-name-y') === $expectedYHeader,
        )
        ->once()
        ->andReturn($respondedRequest);

    $step = Http::get(['header-name-y' => 'quz'])
        ->setLoader($loader)
        ->useInputKeyAsUrl('someUrl')
        ->useInputKeyAsHeaders('customHeaders');

    helper_invokeStepWithInput($step, $inputArray);
});

it('uses a static URL when defined', function () {
    $staticUrl = 'https://www.example.com/servus';

    $respondedRequest = new RespondedRequest(new Request('GET', 'https://www.example.com/servus'), new Response(200));

    $loader = Mockery::mock(HttpLoader::class);

    // The input value 'foo' is not a URL at all; the request has to target the static URL.
    $loader
        ->shouldReceive('load')
        ->withArgs(
            fn (RequestInterface $request) => $request->getUri()->__toString() === 'https://www.example.com/servus',
        )
        ->once()
        ->andReturn($respondedRequest);

    $step = Http::get()
        ->setLoader($loader)
        ->staticUrl($staticUrl);

    helper_invokeStepWithInput($step, 'foo');
});

it('resolves variables in a static URL from input data', function () {
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/foo/bar/baz'),
        new Response(200),
    );

    $loader = Mockery::mock(HttpLoader::class);

    $loader->shouldReceive('usesHeadlessBrowser')->andReturn(false);

    // Both placeholder notations (quoted and unquoted key) have to be replaced with the input values.
    $loader
        ->shouldReceive('load')
        ->withArgs(
            fn (RequestInterface $request) =>
                $request->getUri()->__toString() === 'https://www.example.com/foo/bar/baz',
        )
        ->once()
        ->andReturn($respondedRequest);

    $step = Http::get()
        ->setLoader($loader)
        ->staticUrl('https://www.example.com/[crwl:\'one\']/[crwl:two]/baz');

    helper_invokeStepWithInput($step, ['one' => 'foo', 'two' => 'bar']);
});

it('resolves variables in the request body from input data', function () {
    $inputData = [
        'url' => 'https://www.example.com/foo',
        'hey' => 'ho',
        'yo' => 'lo',
    ];

    $respondedRequest = new RespondedRequest(new Request('GET', 'https://www.example.com/foo'), new Response(200));

    $loader = Mockery::mock(HttpLoader::class);

    $loader->shouldReceive('usesHeadlessBrowser')->andReturn(false);

    // The [crwl:hey] and [crwl:yo] placeholders have to be replaced with 'ho' and 'lo' from the input.
    $loader
        ->shouldReceive('load')
        ->withArgs(fn (RequestInterface $request) => Http::getBodyString($request) === 'Ho ho ho and lo asdf')
        ->once()
        ->andReturn($respondedRequest);

    $step = Http::post(body: 'Ho ho [crwl:hey] and [crwl:yo] asdf')->setLoader($loader);

    helper_invokeStepWithInput($step, $inputData);
});

it('resolves variables in request headers from input data', function () {
    $inputData = [
        'url' => 'https://www.example.com/foo',
        'encoding' => 'deflate, br',
        'language' => 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
    ];

    $headersWithPlaceholders = [
        'Accept-Encoding' => 'gzip, [crwl:"encoding"], zstd',
        'Accept-Language' => '[crwl:language]',
    ];

    $respondedRequest = new RespondedRequest(new Request('GET', 'https://www.example.com/foo'), new Response(200));

    $loader = Mockery::mock(HttpLoader::class);

    // Placeholders inside header values have to be replaced, whether they make up the whole value or a part.
    $loader
        ->shouldReceive('load')
        ->withArgs(
            fn (RequestInterface $request) =>
                $request->getHeaderLine('Accept-Encoding') === 'gzip, deflate, br, zstd' &&
                $request->getHeaderLine('Accept-Language') === 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7',
        )
        ->once()
        ->andReturn($respondedRequest);

    $step = Http::get($headersWithPlaceholders)->setLoader($loader);

    helper_invokeStepWithInput($step, $inputData);
});

test(
    'the getBodyString() method does not generate a warning, when the response contains a ' .
    'Content-Type: application/x-gzip header, but the content actually isn\'t compressed',
    function () {
        $warnings = [];

        // Collect all warnings raised while the handler is installed. Returning false lets PHP's standard
        // error handling continue afterwards.
        set_error_handler(function ($errno, $errstr) use (&$warnings) {
            if ($errno === E_WARNING) {
                $warnings[] = $errstr;
            }

            return false;
        });

        try {
            $response = helper_getRespondedRequest(
                url: 'https://example.com/yolo',
                responseHeaders: ['Content-Type' => 'application/x-gzip'],
                responseBody: 'Servas!',
            );

            $string = Http::getBodyString($response);
        } finally {
            // Always remove the custom handler, even when the code above throws. Otherwise, it would leak into
            // the tests running afterwards.
            restore_error_handler();
        }

        expect($warnings)->toBeEmpty()
            ->and($string)->toBe('Servas!');
    },
);

it('rejects post browser navigate hooks, when the HTTP method is not GET', function (string $httpMethod) {
    $logger = new DummyLogger();

    $step = (new Http($httpMethod))
        ->addLogger($logger)
        ->postBrowserNavigateHook(BrowserAction::wait(1.0));

    $expectedMessage = 'A ' . $httpMethod . ' request cannot be executed using the (headless) browser, so post ' .
        'browser navigate hooks can\'t be defined for this step either.';

    expect($logger->messages)->toHaveCount(1);

    expect($logger->messages[0]['message'])->toBe($expectedMessage);

    // The hook must not have been stored in the step.
    expect(invade($step)->postBrowserNavigateHooks)->toBe([]);
})->with(['POST', 'PUT', 'PATCH', 'DELETE']);

it(
    'calls the HttpLoader::skipCacheForNextRequest() method before calling load when the skipCache() method was called',
    function () {
        $blogPostsResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/blog/posts'),
            new Response(200, body: Utils::streamFor('blog posts')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // Both calls are expected exactly once.
        $loader->shouldReceive('skipCacheForNextRequest')->once();

        $loader->shouldReceive('load')->once()->andReturn($blogPostsResponse);

        $step = Http::get()
            ->setLoader($loader)
            ->skipCache();

        helper_invokeStepWithInput($step);
    },
);

it(
    'calls the HttpLoader::skipCacheForNextRequest() method before calling loadOrFail() when the skipCache() method ' .
    'was called',
    function () {
        $blogPostsResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/blog/posts'),
            new Response(200, body: Utils::streamFor('blog posts')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // Same as with load(), but stopOnErrorResponse() makes the step use loadOrFail().
        $loader->shouldReceive('skipCacheForNextRequest')->once();

        $loader->shouldReceive('loadOrFail')->once()->andReturn($blogPostsResponse);

        $step = Http::get()
            ->setLoader($loader)
            ->skipCache()
            ->stopOnErrorResponse();

        helper_invokeStepWithInput($step);
    },
);

it(
    'switches the loader to use the browser, when useBrowser() was called and the loader is configured to use the ' .
    'HTTP client',
    function () {
        $helloWorldResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // The loader reports that it currently uses the HTTP client.
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);

        // One call to switch to the browser and one call to switch to the HTTP client are expected.
        $loader->shouldReceive('useHeadlessBrowser')->once();

        $loader->shouldReceive('useHttpClient')->once();

        $loader->shouldReceive('load')->once()->andReturn($helloWorldResponse);

        $step = Http::get()
            ->setLoader($loader)
            ->useBrowser();

        helper_invokeStepWithInput($step);
    },
);

it(
    'switches the loader to use the browser, when stopOnErrorResponse() and useBrowser() was called and the loader ' .
    'is configured to use the HTTP client',
    function () {
        $helloWorldResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // The loader reports that it currently uses the HTTP client.
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);

        // One call to switch to the browser and one call to switch to the HTTP client are expected.
        $loader->shouldReceive('useHeadlessBrowser')->once();

        $loader->shouldReceive('useHttpClient')->once();

        $loader->shouldReceive('loadOrFail')->once()->andReturn($helloWorldResponse);

        $step = Http::get()
            ->setLoader($loader)
            ->stopOnErrorResponse()
            ->useBrowser();

        helper_invokeStepWithInput($step);
    },
);

it(
    'does not switch the loader to use the browser, when useBrowser() was called, the loader is configured to use ' .
    'the HTTP client, but the request method is not GET',
    function (string $httpMethod) {
        $logger = new DummyLogger();

        $somethingResponse = new RespondedRequest(
            new Request($httpMethod, 'https://www.example.com/something'),
            new Response(200, body: Utils::streamFor('Something!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);

        // Switching to the browser must not happen for anything other than GET.
        $loader->shouldNotReceive('useHeadlessBrowser');

        $loader->shouldReceive('load')->once()->andReturn($somethingResponse);

        // The dataset values are the names of the static factory methods (Http::post(), Http::put(), ...).
        $step = Http::$httpMethod()
            ->setLoader($loader)
            ->addLogger($logger)
            ->useBrowser();

        helper_invokeStepWithInput($step);

        expect($logger->messages)->toHaveCount(1);

        expect($logger->messages[0]['message'])->toBe(
            'The (headless) browser can only be used for GET requests! Therefore this step will use the HTTP ' .
            'client for loading.',
        );
    },
)->with(['post', 'put', 'patch', 'delete']);

it(
    'automatically switches the loader to use the HTTP client, when the HTTP method is not GET and the loader is ' .
    'configured to use the browser',
    function (string $httpMethod) {
        $logger = new DummyLogger();

        $somethingResponse = new RespondedRequest(
            new Request($httpMethod, 'https://www.example.com/something'),
            new Response(200, body: Utils::streamFor('Something!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // This time the loader reports that it is already using the headless browser.
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(true);

        // One switch to the HTTP client and one switch to the browser are expected.
        $loader->shouldReceive('useHttpClient')->once();

        $loader->shouldReceive('useHeadlessBrowser')->once();

        $loader->shouldReceive('load')->once()->andReturn($somethingResponse);

        // The dataset values are the names of the static factory methods (Http::post(), Http::put(), ...).
        $step = Http::$httpMethod()
            ->setLoader($loader)
            ->addLogger($logger)
            ->useBrowser();

        helper_invokeStepWithInput($step);

        expect($logger->messages)->toHaveCount(1);

        expect($logger->messages[0]['message'])->toBe(
            'The (headless) browser can only be used for GET requests! Therefore this step will use the HTTP ' .
            'client for loading.',
        );
    },
)->with(['post', 'put', 'patch', 'delete']);

it(
    'switches back the loader to use the HTTP client, when stopOnErrorResponse() and useBrowser() was called and ' .
    'loading throws an exception',
    function () {
        $loader = Mockery::mock(HttpLoader::class);

        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(false);

        $loader->shouldReceive('useHeadlessBrowser')->once();

        // This is the actual assertion of this test: the loader has to be switched back to the HTTP client, even
        // though loading fails.
        $loader->shouldReceive('useHttpClient')->once();

        $loader->shouldReceive('loadOrFail')->once()->andThrow(new LoadingException('error message'));

        $step = Http::get()->setLoader($loader)->stopOnErrorResponse()->useBrowser();

        try {
            helper_invokeStepWithInput($step);
        } catch (Throwable) {
            // Intentionally ignored: the exception itself is not what this test is about, only the mock
            // expectations defined above are.
        }
    },
);

it(
    'does not call the useHeadlessBrowser() method of the loader, when useBrowser() was called and the loader is ' .
    'already configured to use the browser',
    function () {
        $helloWorldResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // The loader already uses the browser, so no switching in either direction is expected.
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(true);

        $loader->shouldNotReceive('useHeadlessBrowser');

        $loader->shouldNotReceive('useHttpClient');

        $loader->shouldReceive('load')->once()->andReturn($helloWorldResponse);

        $step = Http::get()
            ->setLoader($loader)
            ->useBrowser();

        helper_invokeStepWithInput($step);
    },
);

it(
    'does not call the useHeadlessBrowser() method of the loader, when stopOnErrorResponse() and useBrowser() was ' .
    'called and the loader is already configured to use the browser',
    function () {
        $helloWorldResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/hello/world'),
            new Response(200, body: Utils::streamFor('Hello World!')),
        );

        $loader = Mockery::mock(HttpLoader::class);

        // The loader already uses the browser, so no switching in either direction is expected.
        $loader->shouldReceive('usesHeadlessBrowser')->once()->andReturn(true);

        $loader->shouldNotReceive('useHeadlessBrowser');

        $loader->shouldNotReceive('useHttpClient');

        $loader->shouldReceive('loadOrFail')->once()->andReturn($helloWorldResponse);

        $step = Http::get()
            ->setLoader($loader)
            ->stopOnErrorResponse()
            ->useBrowser();

        helper_invokeStepWithInput($step);
    },
);

it(
    'sets post browser navigate hooks, when useBrowser() was called and the loader is configured to use the HTTP ' .
    'client',
    function () {
        // Partial mock: methods without an expectation fall through to the real HttpLoader implementation.
        $loader = Mockery::mock(HttpLoader::class)->makePartial();

        $browserHelper = Mockery::mock(HeadlessBrowserLoaderHelper::class);

        $loader->shouldReceive('browser')->andReturn($browserHelper);

        // The hook defined on the step has to reach the browser helper as a Closure.
        $browserHelper
            ->shouldReceive('setTempPostNavigateHooks')
            ->once()
            ->withArgs(fn (array $hooks) => $hooks[0] instanceof Closure);

        $woopResponse = new RespondedRequest(
            new Request('GET', 'https://www.example.com/woop'),
            new Response(200, body: Utils::streamFor('Woop')),
        );

        $loader->shouldReceive('load')->once()->andReturn($woopResponse);

        $step = Http::get()
            ->setLoader($loader)
            ->useBrowser()
            ->postBrowserNavigateHook(BrowserAction::wait(1.0));

        helper_invokeStepWithInput($step);
    },
);


================================================
FILE: tests/Steps/Loading/LoadingStepTest.php
================================================
<?php

namespace tests\Steps\Loading;

use Crwlr\Crawler\Input;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Loader;
use Crwlr\Crawler\Steps\Loading\LoadingStep;
use Crwlr\Crawler\Steps\Step;
use Generator;
use Mockery;

use function tests\helper_invokeStepWithInput;
use function tests\helper_traverseIterable;

test('you can add a loader', function () {
    $httpLoader = Mockery::mock(HttpLoader::class);

    // The loader's load() method has to be called exactly once by the step defined below.
    $httpLoader->shouldReceive('load')->once();

    // Minimal step using the LoadingStep trait, which passes its input straight to the loader.
    $loadingStep = new class extends Step {
        /**
         * @use LoadingStep<HttpLoader>
         */
        use LoadingStep;

        protected function invoke(mixed $input): Generator
        {
            $this->getLoader()->load($input);

            yield [];
        }
    };

    $loadingStep->setLoader($httpLoader);

    $outputs = $loadingStep->invokeStep(new Input('https://www.digitalocean.com/blog'));

    helper_traverseIterable($outputs);
});

test(
    'you can provide a custom loader to a step via the withLoader() method, and it will be preferred to the loader ' .
    'provided via setLoader()',
    function () {
        // Stands in for the loader that the crawler passes to the step via setLoader(). It must stay unused.
        $crawlerLoader = Mockery::mock(Loader::class);

        $crawlerLoader->shouldNotReceive('load');

        // The custom loader passed via withLoader(). This is the one that has to do the loading.
        $customLoader = Mockery::mock(Loader::class);

        $customLoader->shouldReceive('load')->once()->andReturn('Hi');

        $loadingStep = new class extends Step {
            /**
             * @use LoadingStep<Loader>
             */
            use LoadingStep;

            protected function invoke(mixed $input): Generator
            {
                yield $this->getLoader()->load($input);
            }
        };

        $loadingStep->withLoader($customLoader);

        // The crawler calls the step's setLoader() method only after the step was added to the crawler, which
        // means withLoader() is called first. The loader passed to withLoader() has to win nevertheless.
        $loadingStep->setLoader($crawlerLoader);

        helper_invokeStepWithInput($loadingStep);
    },
);


================================================
FILE: tests/Steps/Refiners/AbstractRefinerTest.php
================================================
<?php

namespace tests\Steps\Refiners;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\AbstractRefiner;
use PHPUnit\Framework\TestCase;

/**
 * Minimal concrete refiner, used by the tests in this file to reach the logging features of AbstractRefiner.
 */
class SomeRefiner extends AbstractRefiner
{
    /**
     * Public entry point for the tests to trigger the inherited logTypeWarning() helper.
     */
    public function testLogTypeWarning(): void
    {
        $this->logTypeWarning('Some::staticMethodName()', 'foo');
    }

    /**
     * Writes an info log message when a logger is available and returns the value unchanged.
     */
    public function refine(mixed $value): mixed
    {
        if ($this->logger !== null) {
            $this->logger->info('logging works');
        }

        return $value;
    }
}

/** @var TestCase $this */

// A logger handed over via addLogger() has to be usable from within refine().
it('takes a logger that can be used in the Refiner', function () {
    $someRefiner = new SomeRefiner();
    $someRefiner->addLogger(new CliLogger());

    $someRefiner->refine('foo');

    // The message logged in refine() is expected in the output captured by the test case.
    expect($this->getActualOutputForAssertion())->toContain('logging works');
});

// logTypeWarning() has to produce a message naming the refiner method and the type of the rejected value.
it('provides a method for children to log a warning if the type of the incoming value is wrong', function () {
    $someRefiner = (new SomeRefiner())->addLogger(new CliLogger());

    $someRefiner->testLogTypeWarning();

    $expectedWarning = 'Refiner Some::staticMethodName() can\'t be applied to value of type string';

    expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
});


================================================
FILE: tests/Steps/Refiners/DateTime/DateTimeFormatTest.php
================================================
<?php

namespace tests\Steps\Refiners\DateTime;

use Crwlr\Crawler\Steps\Refiners\DateTimeRefiner;
use tests\_Stubs\DummyLogger;

// Each dataset row is [input date/time string, expected output in the format 'Y-m-d H:i:s'].
it('reformats common date/time strings without knowing the origin format', function (string $from, string $to) {
    $refiner = DateTimeRefiner::reformat('Y-m-d H:i:s');

    expect($refiner->refine($from))->toBe($to);
})->with([
    ['2024-09-21T13:55:41Z', '2024-09-21 13:55:41'],
    ['2024-09-21T13:55:41.000Z', '2024-09-21 13:55:41'],
    ['2024-09-21', '2024-09-21 00:00:00'],
    ['2024-09-21, 13:55:41', '2024-09-21 13:55:41'],
    ['21 September 2024, 13:55:41', '2024-09-21 13:55:41'],
    ['21. September 2024, 13:55:41', '2024-09-21 13:55:41'],
    ['21 September 2024', '2024-09-21 00:00:00'],
    ['21. September 2024', '2024-09-21 00:00:00'],
    ['21.09.2024', '2024-09-21 00:00:00'],
    ['21.09.2024 13:55', '2024-09-21 13:55:00'],
    ['21.09.2024 13:55:41', '2024-09-21 13:55:41'],
    ['Sat, 21 September 2024 13:55:41 +0000', '2024-09-21 13:55:41'],
    ['Sat Sep 21 2024 16:55:41 GMT+0100', '2024-09-21 15:55:41'],
]);

// With the origin format given as second argument, a string containing the literal word "um" can be parsed.
it('reformats a format that PHP\'s strtotime() does not know, when the origin format is provided', function () {
    $refiner = DateTimeRefiner::reformat('Y-m-d H:i:s', 'd. F Y \u\m H:i:s');

    $reformatted = $refiner->refine('21. September 2024 um 13:55:41');

    expect($reformatted)->toBe('2024-09-21 13:55:41');
});

// Without a known origin format the input can't be parsed: one warning is logged and the input is returned as is.
it(
    'logs a warning message (and keeps original input) when it wasn\'t able to auto-convert a date time ' .
    'string',
    function () {
        $dummyLogger = new DummyLogger();

        $refiner = DateTimeRefiner::reformat('Y-m-d H:i:s');
        $refiner->addLogger($dummyLogger);

        $input = '21. September 2024 um 13:55:41';

        $result = $refiner->refine($input);

        expect($dummyLogger->messages)->toHaveCount(1);

        $loggedMessage = $dummyLogger->messages[0];

        expect($loggedMessage['level'])->toBe('warning');
        expect($loggedMessage['message'])->toStartWith('Failed to automatically (without known format) parse');
        expect($result)->toBe('21. September 2024 um 13:55:41');
    },
);

// The origin format given here leads to a parsing failure for the input: a warning is logged and the
// input is returned unchanged.
it(
    'logs a warning message (and keeps original input) when it wasn\'t able to convert a date time string with the ' .
    'given origin format',
    function () {
        $dummyLogger = new DummyLogger();

        $refiner = DateTimeRefiner::reformat('Y-m-d H:i:s', 'd. F Y um H:i:s');
        $refiner->addLogger($dummyLogger);

        $result = $refiner->refine('21. September 2024 um 13:55:41');

        expect($dummyLogger->messages)->toHaveCount(1);

        $loggedMessage = $dummyLogger->messages[0];

        expect($loggedMessage['level'])->toBe('warning');
        expect($loggedMessage['message'])->toStartWith('Failed parsing date/time ');
        expect($result)->toBe('21. September 2024 um 13:55:41');
    },
);

// When the value is an array, every element is expected to be reformatted.
it('reformats an array of date time strings', function () {
    $dateTimeStrings = [
        '2024-09-21T13:55:41Z',
        '2024-09-21T13:55:41.000Z',
        '2024-09-21',
    ];

    $expected = [
        '2024-09-21 13:55:41',
        '2024-09-21 13:55:41',
        '2024-09-21 00:00:00',
    ];

    $reformatted = DateTimeRefiner::reformat('Y-m-d H:i:s')->refine($dateTimeStrings);

    expect($reformatted)->toBe($expected);
});


================================================
FILE: tests/Steps/Refiners/Html/RemoveFromHtmlTest.php
================================================
<?php

namespace tests\Steps\Refiners\Html;

use Crwlr\Crawler\Steps\Dom;
use Crwlr\Crawler\Steps\Refiners\HtmlRefiner;

// Removing by CSS selector from a complete HTML document must keep the rest of the markup.
it('removes a certain node from an HTML document by selector', function () {
    $document = <<<HTML
        <!doctype html>
        <html>
        <head></head>
        <body>
        <h1>Hi!</h1>
        <div id="foo">remove this!</div>
        </body>
        </html>
        HTML;

    $refiner = HtmlRefiner::remove('#foo');

    $cleaned = $refiner->refine($document);

    expect($cleaned)->not()->toContain('remove this!');
    expect($cleaned)->toContain('<h1>Hi!</h1>');
});

// For an HTML snippet (no surrounding document), the result must not be wrapped in an <html> tag.
it('removes a certain node from an HTML snippet by selector', function () {
    $snippet = <<<HTML
        <article>
        <h1>Hi!</h1>
        <p id="foo">remove this!</p>
        </article>
        HTML;

    $refiner = HtmlRefiner::remove('#foo');

    $cleaned = $refiner->refine($snippet);

    expect($cleaned)->not()->toContain('remove this!');
    expect($cleaned)->toContain('<h1>Hi!</h1>');
    expect($cleaned)->not()->toContain('<html>');
});

// All nodes matching the CSS selector are removed, not only the first one.
it('removes multiple nodes from an HTML snippet by selector', function () {
    $snippet = <<<HTML
        <article>
        <ul id="list">
            <li>foo</li>
            <li class="remove">bar</li>
            <li>baz</li>
            <li class="remove">quz</li>
        </ul>
        </article>
        HTML;

    $refiner = HtmlRefiner::remove('#list .remove');

    $cleaned = $refiner->refine($snippet);

    // The two list items with class "remove" are gone.
    expect($cleaned)->not()->toContain('bar');
    expect($cleaned)->not()->toContain('quz');

    // The other list items are still there and no document wrapper was added.
    expect($cleaned)->toContain('<li>foo</li>');
    expect($cleaned)->toContain('<li>baz</li>');
    expect($cleaned)->not()->toContain('<html>');
});

// Same scenario as with a CSS selector, but the nodes to remove are addressed with an XPath query.
it('removes multiple nodes from HTML by xpath query', function () {
    $snippet = <<<HTML
        <article>
        <ul id="list">
            <li>foo</li>
            <li class="remove">bar</li>
            <li>baz</li>
            <li class="remove">quz</li>
        </ul>
        </article>
        HTML;

    $xPathQuery = Dom::xPath('//li[contains(@class, \'remove\')]');

    $cleaned = HtmlRefiner::remove($xPathQuery)->refine($snippet);

    // The two list items with class "remove" are gone.
    expect($cleaned)->not()->toContain('bar');
    expect($cleaned)->not()->toContain('quz');

    // The other list items are still there and no document wrapper was added.
    expect($cleaned)->toContain('<li>foo</li>');
    expect($cleaned)->toContain('<li>baz</li>');
    expect($cleaned)->not()->toContain('<html>');
});

// When the value is an array of HTML snippets, the matching nodes are removed from each element.
it('removes node from an array of HTML snippets', function () {
    $snippets = [
        <<<HTML
        <ul id="list">
            <li>foo</li>
            <li class="remove">bar</li>
            <li>baz</li>
            <li class="remove">quz</li>
        </ul>
        HTML,
        <<<HTML
        <ul id="list">
            <li>lorem</li>
            <li class="remove">ipsum</li>
            <li>dolor</li>
            <li class="remove">sit</li>
        </ul>
        HTML,
    ];

    $cleaned = HtmlRefiner::remove('.remove')->refine($snippets);

    expect($cleaned[0])->not()->toContain('bar');
    expect($cleaned[0])->not()->toContain('quz');
    expect($cleaned[1])->not()->toContain('ipsum');
    expect($cleaned[1])->not()->toContain('sit');
});


================================================
FILE: tests/Steps/Refiners/String/AfterFirstTest.php
================================================
<?php

namespace tests\Steps\Refiners\String;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\StringRefiner;
use PHPUnit\Framework\TestCase;

/** @var TestCase $this */

// Values that are not strings must be returned untouched, accompanied by a logged type warning.
it('logs a warning and returns the unchanged value when $value is not of type string', function (mixed $value) {
    $refiner = StringRefiner::afterFirst('foo')->addLogger(new CliLogger());

    $result = $refiner->refine($value);

    $expectedWarning = 'Refiner StringRefiner::afterFirst() can\'t be applied to value of type ' .
        gettype($value);

    expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
    expect($result)->toBe($value);
})->with([
    [123],
    [12.3],
    [true],
]);

// Each element of an array value is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::afterFirst('a')->addLogger(new CliLogger());

    $result = $refiner->refine(['foo a bar a baz', 'lorem a ipsum a dolor']);

    expect($result)->toBe(['bar a baz', 'ipsum a dolor']);
});

// Only the first occurrence of the needle counts; later occurrences stay part of the result.
it('returns the string after first occurrence of another string', function () {
    $result = StringRefiner::afterFirst('foo')->refine('yo lo foo boo choo foo gnu');

    expect($result)->toBe('boo choo foo gnu');
});

// An empty needle leaves the whole string in place.
it('returns the full string if the string to look for is empty', function () {
    $result = StringRefiner::afterFirst('')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});

// A needle that does not occur in the value leaves the whole string in place.
it('returns the full string when the string to look for is not contained', function () {
    $result = StringRefiner::afterFirst('moo')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});


================================================
FILE: tests/Steps/Refiners/String/AfterLastTest.php
================================================
<?php

namespace tests\Steps\Refiners\String;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\StringRefiner;
use PHPUnit\Framework\TestCase;

/** @var TestCase $this */

// Values that are not strings must be returned untouched, accompanied by a logged type warning.
it('logs a warning and returns the unchanged value when $value is not of type string', function (mixed $value) {
    $refiner = StringRefiner::afterLast('foo')->addLogger(new CliLogger());

    $result = $refiner->refine($value);

    $expectedWarning = 'Refiner StringRefiner::afterLast() can\'t be applied to value of type ' .
        gettype($value);

    expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
    expect($result)->toBe($value);
})->with([
    [123],
    [12.3],
    [true],
]);

// Each element of an array value is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::afterLast('a')->addLogger(new CliLogger());

    $result = $refiner->refine(['foo a bar a baz', 'lorem a ipsum a dolor']);

    expect($result)->toBe(['z', 'dolor']);
});

// Only the last occurrence of the needle counts.
it('returns the string after last occurrence of another string', function () {
    $result = StringRefiner::afterLast('foo')->refine('yo lo foo boo choo foo gnu');

    expect($result)->toBe('gnu');
});

// With an empty needle nothing is left after the "last occurrence".
it('returns an empty string if the string to look for is empty', function () {
    $result = StringRefiner::afterLast('')->refine('yo lo foo boo choo');

    expect($result)->toBe('');
});

// A needle that does not occur in the value leaves the whole string in place.
it('returns the full string when the string to look for is not contained', function () {
    $result = StringRefiner::afterLast('moo')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});


================================================
FILE: tests/Steps/Refiners/String/BeforeFirstTest.php
================================================
<?php

namespace tests\Steps\Refiners\String;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\StringRefiner;
use PHPUnit\Framework\TestCase;

/** @var TestCase $this */

// Values that are not strings must be returned untouched, accompanied by a logged type warning.
it('logs a warning and returns the unchanged value when $value is not of type string', function (mixed $value) {
    $refiner = StringRefiner::beforeFirst('foo')->addLogger(new CliLogger());

    $result = $refiner->refine($value);

    $expectedWarning = 'Refiner StringRefiner::beforeFirst() can\'t be applied to value of type ' .
        gettype($value);

    expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
    expect($result)->toBe($value);
})->with([
    [123],
    [12.3],
    [true],
]);

// Each element of an array value is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::beforeFirst('a')->addLogger(new CliLogger());

    $result = $refiner->refine(['foo a bar a baz', 'lorem a ipsum a dolor']);

    expect($result)->toBe(['foo', 'lorem']);
});

// Only the first occurrence of the needle counts.
it('returns the string before the first occurrence of another string', function () {
    $result = StringRefiner::beforeFirst('foo')->refine('yo lo foo boo choo foo gnu');

    expect($result)->toBe('yo lo');
});

// With an empty needle nothing precedes the "first occurrence".
it('returns an empty string if the string to look for is empty', function () {
    $result = StringRefiner::beforeFirst('')->refine('yo lo foo boo choo');

    expect($result)->toBe('');
});

// A needle that does not occur in the value leaves the whole string in place.
it('returns the full string when the string to look for is not contained', function () {
    $result = StringRefiner::beforeFirst('moo')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});


================================================
FILE: tests/Steps/Refiners/String/BeforeLastTest.php
================================================
<?php

namespace tests\Steps\Refiners\String;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\StringRefiner;
use PHPUnit\Framework\TestCase;

/** @var TestCase $this */

// Values that are not strings must be returned untouched, accompanied by a logged type warning.
it('logs a warning and returns the unchanged value when $value is not of type string', function (mixed $value) {
    $refiner = StringRefiner::beforeLast('foo')->addLogger(new CliLogger());

    $result = $refiner->refine($value);

    $expectedWarning = 'Refiner StringRefiner::beforeLast() can\'t be applied to value of type ' .
        gettype($value);

    expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
    expect($result)->toBe($value);
})->with([
    [123],
    [12.3],
    [true],
]);

// Each element of an array value is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::beforeLast('a')->addLogger(new CliLogger());

    $result = $refiner->refine(['foo a bar a baz', 'lorem a ipsum a dolor']);

    expect($result)->toBe(['foo a bar a b', 'lorem a ipsum']);
});

// Only the last occurrence of the needle counts; earlier occurrences stay part of the result.
it('returns the string before the last occurrence of another string', function () {
    $result = StringRefiner::beforeLast('foo')->refine('yo lo foo boo choo foo gnu');

    expect($result)->toBe('yo lo foo boo choo');
});

// An empty needle leaves the whole string in place.
it('returns the full string if the string to look for is empty', function () {
    $result = StringRefiner::beforeLast('')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});

// A needle that does not occur in the value leaves the whole string in place.
it('returns the full string when the string to look for is not contained', function () {
    $result = StringRefiner::beforeLast('moo')->refine('yo lo foo boo choo');

    expect($result)->toBe('yo lo foo boo choo');
});


================================================
FILE: tests/Steps/Refiners/String/BetweenFirstTest.php
================================================
<?php

namespace tests\Steps\Refiners\String;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\StringRefiner;
use PHPUnit\Framework\TestCase;

/** @var TestCase $this */

// Values that are not strings must be returned untouched, accompanied by a logged type warning.
it('logs a warning and returns the unchanged value when $value is not of type string', function (mixed $value) {
    $refiner = StringRefiner::betweenFirst('foo', 'bar')->addLogger(new CliLogger());

    $result = $refiner->refine($value);

    $expectedWarning = 'Refiner StringRefiner::betweenFirst() can\'t be applied to value of type ' .
        gettype($value);

    expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
    expect($result)->toBe($value);
})->with([
    [123],
    [12.3],
    [true],
]);

// Each element of an array value is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::betweenFirst('foo', 'bar')->addLogger(new CliLogger());

    $values = ['one foo two bar three foo four bar five', 'six foo seven bar eight foo nine bar ten'];

    expect($refiner->refine($values))->toBe(['two', 'seven']);
});

// The part between the first "foo" and the "bar" following it is returned without surrounding whitespace.
it('gets the (trimmed) string between the first occurrence of start and the next occurrence of end', function () {
    $result = StringRefiner::betweenFirst('foo', 'bar')
        ->refine('bla foo bli bar blu foo bar asdf foo bar');

    expect($result)->toBe('bli');
});

// An empty start marker means: take everything from the beginning of the string up to the end marker.
test('if start is an empty string, start from the beginning', function () {
    $result = StringRefiner::betweenFirst('', 'bar')
        ->refine('bla foo bli bar blu foo bar asdf foo bar');

    expect($result)->toBe('bla foo bli');
});

// An empty end marker means: take everything after the start marker up to the end of the string.
test('if end is an empty string, it takes the rest of the string until the end', function () {
    $result = StringRefiner::betweenFirst('blu', '')
        ->refine('bla foo bli bar blu foo bar asdf foo bar');

    expect($result)->toBe('foo bar asdf foo bar');
});

// When the start marker does not occur in the value, the result is an empty string.
it('returns an empty string if start is not contained in the string', function () {
    $result = StringRefiner::betweenFirst('not contained', '')
        ->refine('bla foo bli bar blu foo bar asdf foo bar');

    expect($result)->toBe('');
});


================================================
FILE: tests/Steps/Refiners/String/BetweenLastTest.php
================================================
<?php

namespace tests\Steps\Refiners\String;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\StringRefiner;
use PHPUnit\Framework\TestCase;

/** @var TestCase $this */

// Values that are not strings must be returned untouched, accompanied by a logged type warning.
it('logs a warning and returns the unchanged value when $value is not of type string', function (mixed $value) {
    $refiner = StringRefiner::betweenLast('foo', 'bar')->addLogger(new CliLogger());

    $result = $refiner->refine($value);

    $expectedWarning = 'Refiner StringRefiner::betweenLast() can\'t be applied to value of type ' .
        gettype($value);

    expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
    expect($result)->toBe($value);
})->with([
    [123],
    [12.3],
    [true],
]);

// Each element of an array value is refined on its own.
it('works with an array of strings as value', function () {
    $refiner = StringRefiner::betweenLast('foo', 'bar')->addLogger(new CliLogger());

    $values = ['one foo two bar three foo four bar five', 'six foo seven bar eight foo nine bar ten'];

    expect($refiner->refine($values))->toBe(['four', 'nine']);
});

// The part between the last "foo" and the "bar" following it is returned without surrounding whitespace.
it('gets the (trimmed) string between the last occurrence of start and the next occurrence of end', function () {
    $result = StringRefiner::betweenLast('foo', 'bar')
        ->refine('bla foo bli bar blu foo ble foo blo bar blö bar blä');

    expect($result)->toBe('blo');
});

// An empty start marker means: take everything from the beginning of the string up to the end marker.
test('if start is an empty string, start from the beginning', function () {
    $result = StringRefiner::betweenLast('', 'blu')
        ->refine('bla foo bli bar blu foo ble foo blo bar blö bar blä');

    expect($result)->toBe('bla foo bli bar');
});

// An empty end marker means: take everything after the start marker up to the end of the string.
test('if end is an empty string, it takes the rest of the string until the end', function () {
    $result = StringRefiner::betweenLast('blo', '')
        ->refine('bla foo bli bar blu foo ble foo blo bar blö bar blä');

    expect($result)->toBe('bar blö bar blä');
});

// When the start marker does not occur in the value, betweenLast() is expected to return an empty string.
it('returns an empty string if start is not contained in the string', function () {
    // This file covers betweenLast(), so that is the refiner that has to be exercised here.
    $refiner = StringRefiner::betweenLast('not contained', '');

    $refinedValue = $refiner->refine('bla foo bli bar blu foo bar asdf foo bar');

    expect($refinedValue)->toBe('');
});


================================================
FILE: tests/Steps/Refiners/String/ReplaceTest.php
================================================
<?php

namespace tests\Steps\Refiners\String;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\StringRefiner;
use PHPUnit\Framework\TestCase;

/** @var TestCase $this */

// Values that are not strings must be returned untouched, accompanied by a logged type warning.
it('logs a warning and returns the unchanged value when $value is not of type string', function (mixed $value) {
    $refiner = StringRefiner::replace('foo', 'bar')->addLogger(new CliLogger());

    $result = $refiner->refine($value);

    $expectedWarning = 'Refiner StringRefiner::replace() can\'t be applied to value of type ' .
        gettype($value);

    expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
    expect($result)->toBe($value);
})->with([
    [123],
    [12.3],
    [true],
]);

// Each element of an array value is refined on its own; elements without a match stay as they are.
it('works when the value is an array of strings', function () {
    $refiner = StringRefiner::replace('foo', 'bar')->addLogger(new CliLogger());

    $result = $refiner->refine(['foo boo', 'who foo', 'yo lo']);

    expect($result)->toBe(['bar boo', 'who bar', 'yo lo']);
});

// Every occurrence of the search string is replaced, not only the first one.
it('replaces occurrences of a string with another string', function () {
    $result = StringRefiner::replace('foo', 'bar')->refine('foo, test lorem foo yolo');

    expect($result)->toBe('bar, test lorem bar yolo');
});

// Search and replacement can both be arrays; each search string is replaced by its counterpart.
it('replaces occurrences of an array of strings with another array of strings', function () {
    $refiner = StringRefiner::replace(['foo', 'bar'], ['yo', 'lo']);

    expect($refiner->refine('foo bar baz'))->toBe('yo lo baz');
});

// With an array of search strings and a single replacement, every search string becomes that replacement.
it('replaces occurrences of an array of strings with some single string', function () {
    $refiner = StringRefiner::replace(['foo', 'bar'], '-');

    expect($refiner->refine('foo bar baz'))->toBe('- - baz');
});


================================================
FILE: tests/Steps/Refiners/Url/WithFragmentTest.php
================================================
<?php

namespace tests\Steps\Refiners\Url;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\UrlRefiner;
use Crwlr\Url\Url;
use PHPUnit\Framework\TestCase;
use stdClass;

/** @var TestCase $this */

// Values that are neither string nor URL object must be returned untouched, with a logged type warning.
it(
    'logs a warning and returns the unchanged value when $value is not a string or instance of UriInterface',
    function (mixed $value) {
        $refiner = UrlRefiner::withFragment('foo')->addLogger(new CliLogger());

        $result = $refiner->refine($value);

        $expectedWarning = 'Refiner UrlRefiner::withFragment() can\'t be applied to value of type ' .
            gettype($value);

        expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
        expect($result)->toBe($value);
    },
)->with([
    [123],
    [true],
    [new stdClass()],
]);

// Each dataset row is [input URL (string or URL object), expected URL string with the new fragment].
// Covers URLs with an existing fragment, without a fragment and with an empty fragment.
it('replaces the fragment in a URL', function (mixed $value, string $expected) {
    expect(UrlRefiner::withFragment('#lorem')->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com/path#foo', 'https://www.example.com/path#lorem'],
    ['https://www.example.com/path', 'https://www.example.com/path#lorem'],
    [Url::parse('https://www.crwlr.software/some/path#abc'), 'https://www.crwlr.software/some/path#lorem'],
    [Url::parsePsr7('https://www.crwl.io/quz#'), 'https://www.crwl.io/quz#lorem'],
]);

// Each dataset row is [input URL, expected URL string without any fragment].
it('resets any fragment', function (mixed $value, string $expected) {
    expect(UrlRefiner::withoutFragment()->refine($value))->toBe($expected);
})->with([
    ['https://www.example.com/foo#bar', 'https://www.example.com/foo'],
    ['https://www.crwlr.software/#', 'https://www.crwlr.software/'],
]);

// Each element of an array value is refined on its own.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com/path#foo',
        'https://www.example.com/path#bar',
    ];

    $result = UrlRefiner::withFragment('#lorem')->refine($urls);

    expect($result)->toBe(['https://www.example.com/path#lorem', 'https://www.example.com/path#lorem']);
});


================================================
FILE: tests/Steps/Refiners/Url/WithHostTest.php
================================================
<?php

namespace tests\Steps\Refiners\Url;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\UrlRefiner;
use Crwlr\Url\Url;
use PHPUnit\Framework\TestCase;
use stdClass;

/** @var TestCase $this */

// Values that are neither string nor URL object must be returned untouched, with a logged type warning.
it(
    'logs a warning and returns the unchanged value when $value is not a string or instance of UriInterface',
    function (mixed $value) {
        $refiner = UrlRefiner::withHost('www.crwlr.software')->addLogger(new CliLogger());

        $result = $refiner->refine($value);

        $expectedWarning = 'Refiner UrlRefiner::withHost() can\'t be applied to value of type ' .
            gettype($value);

        expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
        expect($result)->toBe($value);
    },
)->with([
    [123],
    [true],
    [new stdClass()],
]);

// Each dataset row is [input URL (string or URL object), expected URL string with the new host].
it('replaces the host in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withHost('www.crwlr.software');

    $result = $refiner->refine($value);

    expect($result)->toBe($expected);
})->with([
    ['https://www.example.com/foo', 'https://www.crwlr.software/foo'],
    ['https://www.crwl.io/bar', 'https://www.crwlr.software/bar'],
    [Url::parse('https://www.crwlr.software/baz'), 'https://www.crwlr.software/baz'],
    [Url::parsePsr7('https://crwl.io/quz'), 'https://www.crwlr.software/quz'],
]);

// Each element of an array value is refined on its own.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com/foo',
        'https://www.example.com/bar',
    ];

    $result = UrlRefiner::withHost('crwl.io')->refine($urls);

    expect($result)->toBe(['https://crwl.io/foo', 'https://crwl.io/bar']);
});


================================================
FILE: tests/Steps/Refiners/Url/WithPathTest.php
================================================
<?php

namespace tests\Steps\Refiners\Url;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\UrlRefiner;
use Crwlr\Url\Url;
use PHPUnit\Framework\TestCase;
use stdClass;

/** @var TestCase $this */

// Values that are neither string nor URL object must be returned untouched, with a logged type warning.
it(
    'logs a warning and returns the unchanged value when $value is not a string or instance of UriInterface',
    function (mixed $value) {
        $refiner = UrlRefiner::withPath('/home')->addLogger(new CliLogger());

        $result = $refiner->refine($value);

        $expectedWarning = 'Refiner UrlRefiner::withPath() can\'t be applied to value of type ' .
            gettype($value);

        expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
        expect($result)->toBe($value);
    },
)->with([
    [123],
    [true],
    [new stdClass()],
]);

// Each dataset row is [input URL (string or URL object), expected URL string with the new path].
it('replaces the path in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withPath('/some/path/123');

    $result = $refiner->refine($value);

    expect($result)->toBe($expected);
})->with([
    ['https://www.example.com/foo', 'https://www.example.com/some/path/123'],
    ['https://localhost/yo', 'https://localhost/some/path/123'],
    [Url::parse('https://www.crwlr.software/packages'), 'https://www.crwlr.software/some/path/123'],
    [Url::parsePsr7('https://www.crwl.io/'), 'https://www.crwl.io/some/path/123'],
]);

// Each element of an array value is refined on its own.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com/foo',
        'https://www.example.com/bar',
    ];

    $result = UrlRefiner::withPath('/hawedere')->refine($urls);

    expect($result)->toBe(['https://www.example.com/hawedere', 'https://www.example.com/hawedere']);
});


================================================
FILE: tests/Steps/Refiners/Url/WithPortTest.php
================================================
<?php

namespace tests\Steps\Refiners\Url;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\UrlRefiner;
use Crwlr\Url\Url;
use PHPUnit\Framework\TestCase;
use stdClass;

/** @var TestCase $this */

// Values that are neither string nor URL object must be returned untouched, with a logged type warning.
it(
    'logs a warning and returns the unchanged value when $value is not a string or instance of UriInterface',
    function (mixed $value) {
        $refiner = UrlRefiner::withPort(1234)->addLogger(new CliLogger());

        $result = $refiner->refine($value);

        $expectedWarning = 'Refiner UrlRefiner::withPort() can\'t be applied to value of type ' .
            gettype($value);

        expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
        expect($result)->toBe($value);
    },
)->with([
    [123],
    [true],
    [new stdClass()],
]);

// Each dataset row is [input URL (string or URL object), expected URL string with the new port].
it('replaces the port in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withPort(1234);

    $result = $refiner->refine($value);

    expect($result)->toBe($expected);
})->with([
    ['https://www.example.com:8000/foo', 'https://www.example.com:1234/foo'],
    ['https://localhost:8080/yo', 'https://localhost:1234/yo'],
    [Url::parse('https://www.crwlr.software:5678/bar'), 'https://www.crwlr.software:1234/bar'],
    [Url::parsePsr7('https://crwl.io/quz'), 'https://crwl.io:1234/quz'],
]);

// Each element of an array value is refined on its own.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com/foo',
        'https://www.example.com/bar',
    ];

    $result = UrlRefiner::withPort(1234)->refine($urls);

    expect($result)->toBe(['https://www.example.com:1234/foo', 'https://www.example.com:1234/bar']);
});


================================================
FILE: tests/Steps/Refiners/Url/WithQueryTest.php
================================================
<?php

namespace tests\Steps\Refiners\Url;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\UrlRefiner;
use Crwlr\Url\Url;
use PHPUnit\Framework\TestCase;
use stdClass;

/** @var TestCase $this */

// Values that are neither string nor URL object must be returned untouched, with a logged type warning.
it(
    'logs a warning and returns the unchanged value when $value is not a string or instance of UriInterface',
    function (mixed $value) {
        $refiner = UrlRefiner::withQuery('a=b&c=d')->addLogger(new CliLogger());

        $result = $refiner->refine($value);

        $expectedWarning = 'Refiner UrlRefiner::withQuery() can\'t be applied to value of type ' .
            gettype($value);

        expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
        expect($result)->toBe($value);
    },
)->with([
    [123],
    [true],
    [new stdClass()],
]);

// Each dataset row is [input URL (string or URL object), expected URL string with the new query].
it('replaces the query in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withQuery('a=b&c=d');

    $result = $refiner->refine($value);

    expect($result)->toBe($expected);
})->with([
    ['https://www.example.com/foo?one=two', 'https://www.example.com/foo?a=b&c=d'],
    ['https://www.example.com/bar', 'https://www.example.com/bar?a=b&c=d'],
    [Url::parse('https://www.crwlr.software/?'), 'https://www.crwlr.software/?a=b&c=d'],
    [Url::parsePsr7('https://www.crwl.io/quz?a=c&b=d'), 'https://www.crwl.io/quz?a=b&c=d'],
]);

// Each dataset row is [input URL, expected URL string without any query].
it('resets any query', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withoutQuery();

    $result = $refiner->refine($value);

    expect($result)->toBe($expected);
})->with([
    ['https://www.example.com/foo?one=two', 'https://www.example.com/foo'],
    ['https://www.crwlr.software/?', 'https://www.crwlr.software/'],
]);

// Each element of an array value is refined on its own.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com/foo?one=two',
        'https://www.example.com/bar?three=four',
    ];

    $result = UrlRefiner::withoutQuery()->refine($urls);

    expect($result)->toBe(['https://www.example.com/foo', 'https://www.example.com/bar']);
});


================================================
FILE: tests/Steps/Refiners/Url/WithSchemeTest.php
================================================
<?php

namespace tests\Steps\Refiners\Url;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\UrlRefiner;
use Crwlr\Url\Url;
use PHPUnit\Framework\TestCase;
use stdClass;

/** @var TestCase $this */

// Values that are neither string nor URL object must be returned untouched, with a logged type warning.
it(
    'logs a warning and returns the unchanged value when $value is not a string or instance of UriInterface',
    function (mixed $value) {
        $refiner = UrlRefiner::withScheme('https')->addLogger(new CliLogger());

        $result = $refiner->refine($value);

        $expectedWarning = 'Refiner UrlRefiner::withScheme() can\'t be applied to value of type ' .
            gettype($value);

        expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
        expect($result)->toBe($value);
    },
)->with([
    [123],
    [true],
    [new stdClass()],
]);

// Each dataset row is [input URL (string or URL object), expected URL string with the new scheme].
it('replaces the scheme in a URL', function (mixed $value, string $expected) {
    $refiner = UrlRefiner::withScheme('https');

    $result = $refiner->refine($value);

    expect($result)->toBe($expected);
})->with([
    ['http://www.example.com/foo', 'https://www.example.com/foo'],
    ['https://www.example.com/foo', 'https://www.example.com/foo'],
    [Url::parse('ftp://www.example.com/bar'), 'https://www.example.com/bar'],
    [Url::parsePsr7('http://www.example.com/baz'), 'https://www.example.com/baz'],
]);

// Each element of an array value is refined on its own.
it('refines an array of URLs', function () {
    $urls = [
        'http://www.example.com/foo',
        'https://www.example.com/bar',
    ];

    $result = UrlRefiner::withScheme('https')->refine($urls);

    expect($result)->toBe(['https://www.example.com/foo', 'https://www.example.com/bar']);
});


================================================
FILE: tests/Steps/Refiners/Url/WithoutPortTest.php
================================================
<?php

namespace tests\Steps\Refiners\Url;

use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Steps\Refiners\UrlRefiner;
use Crwlr\Url\Url;
use PHPUnit\Framework\TestCase;
use stdClass;

/** @var TestCase $this */

// Values that are neither strings nor URL objects must come back untouched, and the CLI logger output must
// contain the refiner's message naming the offending type.
it(
    'logs a warning and returns the unchanged value when $value is not a string or instance of UriInterface',
    function (mixed $value) {
        $refiner = UrlRefiner::withoutPort()->addLogger(new CliLogger());

        $result = $refiner->refine($value);

        $expectedWarning = 'Refiner UrlRefiner::withoutPort() can\'t be applied to value of type ' . gettype($value);

        expect($this->getActualOutputForAssertion())->toContain($expectedWarning);

        expect($result)->toBe($value);
    },
)->with([
    [123],
    [true],
    [new stdClass()],
]);

// Dataset rows are [value to refine, expected result]; the last row has no port at all and must stay as it is.
it('resets the port to null in a URL', function (mixed $value, string $expected) {
    $refined = UrlRefiner::withoutPort()->refine($value);

    expect($refined)->toBe($expected);
})->with([
    ['https://www.example.com:8000/foo', 'https://www.example.com/foo'],
    ['http://localhost:8080/yo', 'http://localhost/yo'],
    [Url::parse('https://www.crwlr.software:5678/bar'), 'https://www.crwlr.software/bar'],
    [Url::parsePsr7('https://crwl.io/quz'), 'https://crwl.io/quz'],
]);

// An array of URLs is refined element by element.
it('refines an array of URLs', function () {
    $urls = [
        'https://www.example.com:8000/foo',
        'https://www.example.com:8080/bar',
    ];

    $refined = UrlRefiner::withoutPort()->refine($urls);

    expect($refined)->toBe(['https://www.example.com/foo', 'https://www.example.com/bar']);
});


================================================
FILE: tests/Steps/Sitemap/GetUrlsFromSitemapTest.php
================================================
<?php

namespace tests\Steps\Sitemap;

use Crwlr\Crawler\Steps\Sitemap;

use function tests\helper_invokeStepWithInput;

it('gets all urls from a sitemap XML', function () {
    // Fixture: a sitemap containing nine <url> entries.
    $sitemapXml = <<<XML
        <?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://www.crwlr.software/</loc><priority>0.5</priority></url>
        <url><loc>https://www.crwlr.software/packages</loc><priority>0.7</priority></url>
        <url><loc>https://www.crwlr.software/blog</loc><priority>0.7</priority></url>
        <url><loc>https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5</loc><priority>1</priority><lastmod>2022-09-03</lastmod></url>
        <url><loc>https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php</loc><priority>1</priority><lastmod>2022-06-02</lastmod></url>
        <url><loc>https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4</loc><priority>1</priority><lastmod>2022-05-10</lastmod></url>
        <url><loc>https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-2-and-v0-3</loc><priority>1</priority><lastmod>2022-04-30</lastmod></url>
        <url><loc>https://www.crwlr.software/blog/release-of-crwlr-crawler-v-0-1-0</loc><priority>1</priority><lastmod>2022-04-18</lastmod></url>
        <url><loc>https://www.crwlr.software/blog/prevent-homograph-attacks-in-user-input-urls</loc><priority>1</priority><lastmod>2022-01-19</lastmod></url>
        </urlset>
        XML;

    $urls = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $sitemapXml);

    expect($urls)->toHaveCount(9);

    // Spot-check the first and the last entry.
    expect($urls[0]->get())->toBe('https://www.crwlr.software/');

    expect($urls[8]->get())->toBe('https://www.crwlr.software/blog/prevent-homograph-attacks-in-user-input-urls');
});

it('gets all urls with additional data when the withData() method is used', function () {
    $sitemapXml = <<<XML
        <?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        <url><loc>https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5</loc><priority>1</priority><lastmod>2022-09-03</lastmod></url>
        <url><loc>https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php</loc><priority>1</priority><lastmod>2022-06-02</lastmod></url>
        <url><loc>https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4</loc><priority>0.7</priority><lastmod>2022-05-10</lastmod></url>
        </urlset>
        XML;

    $entries = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap()->withData(), $sitemapXml);

    // One expected array per <url> element, in document order.
    $expectedEntries = [
        [
            'url' => 'https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5',
            'lastmod' => '2022-09-03',
            'priority' => '1',
        ],
        [
            'url' => 'https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php',
            'lastmod' => '2022-06-02',
            'priority' => '1',
        ],
        [
            'url' => 'https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4',
            'lastmod' => '2022-05-10',
            'priority' => '0.7',
        ],
    ];

    expect($entries)->toHaveCount(3);

    foreach ($expectedEntries as $index => $expectedEntry) {
        expect($entries[$index]->get())->toBe($expectedEntry);
    }
});

// A <urlset> without any <url> children simply results in no outputs.
it('doesn\'t fail when sitemap is empty', function () {
    $emptySitemapXml = <<<XML
        <?xml version="1.0" encoding="UTF-8"?>
        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
        </urlset>
        XML;

    $step = Sitemap::getUrlsFromSitemap()->withData();

    expect(helper_invokeStepWithInput($step, $emptySitemapXml))->toHaveCount(0);
});

// Fixture: the <urlset> tag carries xmlns:xsi and xsi:schemaLocation attributes spread over several lines.
it(
    'doesn\'t fail when the urlset tag contains attributes, that would cause the symfony DomCrawler to not find the ' .
    'elements',
    function () {
        $sitemapXml = <<<XML
            <?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
                    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"
                    xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                <url><loc>https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5</loc></url>
                <url><loc>https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php</loc></url>
                <url><loc>https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4</loc></url>
            </urlset>
            XML;

        $urls = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $sitemapXml);

        expect($urls)->toHaveCount(3);
    },
);

// Same scenario as with the multi-line fixture, but the whole XML document is on a single line.
it(
    'doesn\'t fail when the urlset tag contains attributes, that would cause the symfony DomCrawler to not find the ' .
    'elements, when the XML content has no line breaks',
    function () {
        $sitemapXml = <<<XML
            <?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:news="http://www.google.com/schemas/sitemap-news/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xmlns:video="http://www.google.com/schemas/sitemap-video/1.1"><url><loc>https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5</loc></url><url><loc>https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-php</loc></url><url><loc>https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4</loc></url></urlset>
            XML;

        $urls = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $sitemapXml);

        expect($urls)->toHaveCount(3);
    },
);


================================================
FILE: tests/Steps/StepTest.php
================================================
<?php

namespace tests\Steps;

use Crwlr\Crawler\Input;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Logger\CliLogger;
use Crwlr\Crawler\Output;
use Crwlr\Crawler\Steps\Filters\Filter;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Refiners\StringRefiner;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\Steps\StepOutputType;
use Exception;
use Generator;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;
use InvalidArgumentException;
use PHPUnit\Framework\TestCase;
use stdClass;
use tests\_Stubs\DummyLogger;

use function tests\helper_getInputReturningStep;
use function tests\helper_getStdClassWithData;
use function tests\helper_getStepYieldingInputArrayAsSeparateOutputs;
use function tests\helper_getStepYieldingMultipleArraysWithNumber;
use function tests\helper_getStepYieldingMultipleNumbers;
use function tests\helper_getStepYieldingMultipleObjectsWithNumber;
use function tests\helper_getValueReturningStep;
use function tests\helper_invokeStepWithInput;
use function tests\helper_traverseIterable;

/** @var TestCase $this */

test('You can add a logger and it is available within the invoke method', function () {
    $loggingStep = new class extends Step {
        /**
         * Sends an info message to the step's logger (if one is set) and yields a dummy value.
         *
         * @return Generator<string>
         */
        protected function invoke(mixed $input): Generator
        {
            $this->logger?->info('logging works');

            yield 'something';
        }
    };

    $loggingStep->addLogger(new CliLogger());

    // Traverse the complete result of invokeStep().
    $result = $loggingStep->invokeStep(new Input('test'));

    helper_traverseIterable($result);

    expect($this->getActualOutputForAssertion())->toContain('logging works');
});

// A plain value yielded by a step must arrive as exactly one Output object carrying that value.
test('The invokeStep method wraps the values returned by invoke in Output objects', function () {
    $outputs = helper_invokeStepWithInput(helper_getValueReturningStep('returnValue'));

    expect($outputs)->toHaveCount(1);

    expect($outputs[0])->toBeInstanceOf(Output::class);

    expect($outputs[0]->get())->toBe('returnValue');
});

/* ------------------------------- keep() ------------------------------- */

test('keep() can pick keys from nested (array) output using dot notation', function () {
    $stepOutput = [
        'users' => [
            ['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear'],
            ['user' => 'juerx', 'firstname' => 'Jürgen', 'surname' => 'Müller'],
            ['user' => 'sandy', 'firstname' => 'Sandra', 'surname' => 'Mayr'],
        ],
        'foo' => 'bar',
    ];

    // 'users.0.user' addresses the 'user' key of the first element of 'users' and is kept as 'nickname'.
    $step = helper_getValueReturningStep($stepOutput)->keep(['nickname' => 'users.0.user', 'foo']);

    $outputs = helper_invokeStepWithInput($step);

    expect($outputs[0]->keep)->toBe(['nickname' => 'otsch', 'foo' => 'bar']);
});

test('keep() picks keys from nested output including a RespondedRequest object', function () {
    $respondedRequest = new RespondedRequest(
        new Request('GET', 'https://www.example.com/something'),
        new Response(200, body: 'Hi :)'),
    );

    // 'response.body' is expected to resolve to the body of the response inside the RespondedRequest.
    $step = helper_getValueReturningStep(['response' => $respondedRequest, 'foo' => 'bar'])
        ->keep(['content' => 'response.body']);

    $outputs = helper_invokeStepWithInput($step);

    expect($outputs[0]->keep)->toBe(['content' => 'Hi :)']);
});

// String keys in the array passed to keep() are the new names, the values are the output keys to take.
it('maps output keys to different keys when defined in the array passed to keep()', function () {
    $person = ['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear'];

    $step = helper_getValueReturningStep($person)->keep(['foo' => 'firstname', 'bar' => 'surname']);

    $outputs = helper_invokeStepWithInput($step);

    expect($outputs[0]->keep)->toBe(['foo' => 'Christian', 'bar' => 'Olear']);
});

/* ------------------------------- useInputKey() ------------------------------- */

// With useInputKey('bar') the step only receives the value stored under 'bar' in the input array.
it('uses a key from array input when defined', function () {
    $step = helper_getInputReturningStep()->useInputKey('bar');

    $input = new Input(['foo' => 'fooValue', 'bar' => 'barValue', 'baz' => 'bazValue']);

    $outputs = helper_invokeStepWithInput($step, $input);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('barValue');
});

// The input array has no 'baz' key: no output is produced and the logger output contains a message about it.
it('logs a warning message when the input key to use does not exist in input array', function () {
    $step = helper_getInputReturningStep()->useInputKey('baz');
    $step->addLogger(new CliLogger());

    $outputs = helper_invokeStepWithInput($step, new Input(['foo' => 'one', 'bar' => 'two']));

    expect($outputs)->toHaveCount(0);

    expect($this->getActualOutputForAssertion())->toContain('Can\'t get key from input, because it does not exist.');
});

// useInputKey() needs array input; for a string, an int and an object the step yields nothing and the logger
// output names the actual type.
it(
    'logs a warning message when useInputKey() was called but the input value is not an array',
    function (mixed $inputValue) {
        $step = helper_getInputReturningStep()->useInputKey('baz');
        $step->addLogger(new CliLogger());

        $outputs = helper_invokeStepWithInput($step, new Input($inputValue));

        expect($outputs)->toHaveCount(0);

        $expectedWarning = 'Can\'t get key from input, because input is of type ' . gettype($inputValue) .
            ' instead of array.';

        expect($this->getActualOutputForAssertion())->toContain($expectedWarning);
    },
)->with([
    ['string'],
    [0],
    [new stdClass()],
]);

// Data kept on the incoming Input has to show up on the output's keep property, even with useInputKey() set.
it('does not lose previously kept data, when it uses the useInputKey() method', function () {
    $step = helper_getValueReturningStep(['test' => 'test'])->useInputKey('foo');

    $inputWithKeptData = new Input(['foo' => 'test'], ['some' => 'thing']);

    $outputs = helper_invokeStepWithInput($step, $inputWithKeptData);

    expect($outputs[0]->keep)->toBe(['some' => 'thing']);
});

// keepFromInput() must keep the complete original input array, not just the value selected via useInputKey().
it('keeps the original input data when useInputKey() is used', function () {
    $step = helper_getValueReturningStep(['baz' => 'three']);

    $step->keepFromInput()->useInputKey('bar');

    $outputs = helper_invokeStepWithInput($step, ['foo' => 'one', 'bar' => 'two']);

    expect($outputs[0]->get())->toBe(['baz' => 'three']);

    expect($outputs[0]->keep)->toBe(['foo' => 'one', 'bar' => 'two']);
});

// The input value itself is a plain string; the key 'bar' only exists in the data kept on the Input.
test('useInputKey() can be used to get data that was kept from a previous step with keep() or keepAs()', function () {
    $step = helper_getInputReturningStep();
    $step->useInputKey('bar');

    $input = new Input('value', keep: ['bar' => 'baz']);

    $outputs = helper_invokeStepWithInput($step, $input);

    expect($outputs[0]->get())->toBe('baz');
});

// The step itself defines nothing to keep; the data kept on the Input still has to be on the Output.
it(
    'also passes on kept data through further steps when they don\'t define any further data to keep',
    function () {
        $input = new Input('inputValue', ['prevProperty' => 'foobar']);

        $outputs = helper_invokeStepWithInput(helper_getValueReturningStep('returnValue'), $input);

        expect($outputs)->toHaveCount(1);

        expect($outputs[0]->keep)->toBe(['prevProperty' => 'foobar']);
    },
);

/* ------------------------------- uniqueInputs() ------------------------------- */

it('doesn\'t invoke twice with duplicate inputs when uniqueInput was called', function () {
    $step = helper_getInputReturningStep();

    // Before uniqueInputs() is called, the same input yields an output every time.
    expect(helper_invokeStepWithInput($step, 'foo'))->toHaveCount(1);

    expect(helper_invokeStepWithInput($step, 'foo'))->toHaveCount(1);

    $step->uniqueInputs();

    // Afterwards only the first occurrence of an input yields an output.
    expect(helper_invokeStepWithInput($step, 'foo'))->toHaveCount(1);

    expect(helper_invokeStepWithInput($step, 'foo'))->toHaveCount(0);
});

it(
    'doesn\'t invoke twice with inputs with the same value in an array key when uniqueInput was called with that key',
    function () {
        $first = ['foo' => 'bar', 'number' => 1];

        $second = ['foo' => 'bar', 'number' => 2];

        $step = helper_getInputReturningStep();

        // Without a key the inputs differ (in 'number'), so both are processed.
        $step->uniqueInputs();

        expect(helper_invokeStepWithInput($step, $first))->toHaveCount(1);

        expect(helper_invokeStepWithInput($step, $second))->toHaveCount(1);

        $step->resetAfterRun();

        // With the key 'foo' both inputs count as the same, so the second one yields nothing.
        $step->uniqueInputs('foo');

        expect(helper_invokeStepWithInput($step, $first))->toHaveCount(1);

        expect(helper_invokeStepWithInput($step, $second))->toHaveCount(0);
    },
);

it(
    'doesn\'t invoke twice with inputs with the same value in an object key when uniqueInput was called with that key',
    function () {
        $step = helper_getInputReturningStep();

        // Without a key the objects differ (in 'number'), so both are processed.
        $step->uniqueInputs();

        $first = helper_getStdClassWithData(['foo' => 'bar', 'number' => 1]);

        expect(helper_invokeStepWithInput($step, $first))->toHaveCount(1);

        $second = helper_getStdClassWithData(['foo' => 'bar', 'number' => 2]);

        expect(helper_invokeStepWithInput($step, $second))->toHaveCount(1);

        $step->resetAfterRun();

        // With the key 'foo' both objects count as the same, so the second one yields nothing.
        $step->uniqueInputs('foo');

        $first = helper_getStdClassWithData(['foo' => 'bar', 'number' => 1]);

        expect(helper_invokeStepWithInput($step, $first))->toHaveCount(1);

        $second = helper_getStdClassWithData(['foo' => 'bar', 'number' => 2]);

        expect(helper_invokeStepWithInput($step, $second))->toHaveCount(0);
    },
);

/* ------------------------------- uniqueOutputs() ------------------------------- */

// With uniqueOutputs() the helper step's outputs are expected to be reduced to five distinct values.
it('makes outputs unique when uniqueOutput was called', function () {
    $step = helper_getStepYieldingMultipleNumbers();
    $step->uniqueOutputs();

    $outputs = helper_invokeStepWithInput($step, new Input('anything'));

    expect($outputs)->toHaveCount(5);

    foreach (['one', 'two', 'three', 'four', 'five'] as $index => $expectedValue) {
        expect($outputs[$index]->get())->toBe($expectedValue);
    }
});

// Uniqueness of array outputs is determined by the value under the 'number' key.
it('makes outputs unique when providing a key name to uniqueOutput to use from array output', function () {
    $step = helper_getStepYieldingMultipleArraysWithNumber();
    $step->uniqueOutputs('number');

    $outputs = helper_invokeStepWithInput($step, new Input('anything'));

    expect($outputs)->toHaveCount(5);
});

// Uniqueness of object outputs is determined by the value of the 'number' property.
it('makes outputs unique when providing a key name to uniqueOutput to use from object output', function () {
    $step = helper_getStepYieldingMultipleObjectsWithNumber();
    $step->uniqueOutputs('number');

    $outputs = helper_invokeStepWithInput($step, new Input('anything'));

    expect($outputs)->toHaveCount(5);
});

// Without a key the whole array output is compared. The helper step is invoked once with false and once with
// true as input; the expected output counts are 5 and 8.
it('makes array outputs unique when providing no key name to uniqueOutput', function () {
    $step = helper_getStepYieldingMultipleArraysWithNumber();
    $step->uniqueOutputs();

    expect(helper_invokeStepWithInput($step, new Input(false)))->toHaveCount(5);

    expect(helper_invokeStepWithInput($step, new Input(true)))->toHaveCount(8);
});

// Without a key, whole output objects have to be compared for uniqueness.
it('makes object outputs unique when providing no key name to uniqueOutput', function () {
    // This has to be the object-yielding helper. With the array-yielding one, the test is an exact duplicate of
    // 'makes array outputs unique when providing no key name to uniqueOutput' and object outputs are never covered.
    $step = helper_getStepYieldingMultipleObjectsWithNumber();

    $step->uniqueOutputs();

    // NOTE(review): the expected counts assume that the object helper yields the same data as the array helper
    // (same values for input false, additionally distinguishable elements for input true) - confirm against the
    // helper definition.
    $output = helper_invokeStepWithInput($step, new Input(false));

    expect($output)->toHaveCount(5);

    $output = helper_invokeStepWithInput($step, new Input(true));

    expect($output)->toHaveCount(8);
});

/* ----------------------------- oneOutputPerInput() ----------------------------- */

// The helper step yields every element of the input array separately; oneOutputPerInput() has to merge them
// back into one array output.
test(
    'when a step yields multiple outputs per input and the oneOutputPerInput() method was called, the step yields it ' .
    'as a single output with an array of all the single output values',
    function () {
        $step = helper_getStepYieldingInputArrayAsSeparateOutputs();
        $step->oneOutputPerInput();

        $elements = ['foo', 'bar', 'baz'];

        $outputs = helper_invokeStepWithInput($step, $elements);

        expect($outputs)->toHaveCount(1);

        expect($outputs[0]->get())->toBe(['foo', 'bar', 'baz']);
    },
);

test('when using oneOutputPerInput(), the combined output counts as one output for the max outputs limit', function () {
    $step = helper_getStepYieldingInputArrayAsSeparateOutputs();
    $step->oneOutputPerInput()->maxOutputs(2);

    $elements = ['foo', 'bar', 'baz'];

    // The first two invocations each yield one combined output, although it consists of three elements.
    for ($run = 1; $run <= 2; $run++) {
        $outputs = helper_invokeStepWithInput($step, $elements);

        expect($outputs)->toHaveCount(1);

        expect($outputs[0]->get())->toBe(['foo', 'bar', 'baz']);
    }

    // With the limit of 2 reached, the third invocation yields nothing.
    expect(helper_invokeStepWithInput($step, $elements))->toHaveCount(0);
});

test('when using oneOutputPerInput(), refiners are applied to the single elements of the combined output', function () {
    $step = helper_getStepYieldingInputArrayAsSeparateOutputs();
    $step->oneOutputPerInput();

    // The refiner appends '-hey' to the 'title' of each element.
    $step->refineOutput('title', fn(mixed $outputValue) => $outputValue . '-hey');

    $elements = [
        ['title' => 'foo'],
        ['title' => 'bar'],
        ['title' => 'baz'],
    ];

    $outputs = helper_invokeStepWithInput($step, $elements);

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe([
        ['title' => 'foo-hey'],
        ['title' => 'bar-hey'],
        ['title' => 'baz-hey'],
    ]);
});

test('when using oneOutputPerInput(), filters are applied to the single elements of the combined output', function () {
    $step = helper_getStepYieldingInputArrayAsSeparateOutputs();
    $step->where('id', Filter::greaterThan(109));
    $step->oneOutputPerInput();

    $elements = [
        ['title' => 'foo', 'id' => 109],
        ['title' => 'bar', 'id' => 110],
        ['title' => 'baz', 'id' => 111],
    ];

    $outputs = helper_invokeStepWithInput($step, $elements);

    expect($outputs)->toHaveCount(1);

    // The element with id 109 is not greater than 109 and therefore missing in the combined output.
    expect($outputs[0]->get())->toBe([
        ['title' => 'bar', 'id' => 110],
        ['title' => 'baz', 'id' => 111],
    ]);
});

// outputKey('test') wraps the combined array as a whole, not every single element.
test(
    'when using oneOutputPerInput() in combination with outputKey(), the whole combined output is returned in an ' .
    'array with the defined key',
    function () {
        $step = helper_getStepYieldingInputArrayAsSeparateOutputs();
        $step->outputKey('test');
        $step->oneOutputPerInput();

        $outputs = helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']);

        expect($outputs)->toHaveCount(1);

        expect($outputs[0]->get())->toBe(['test' => ['foo', 'bar', 'baz']]);
    },
);

test(
    'when using oneOutputPerInput() in combination with uniqueOutputs(), the whole combined output is compared',
    function () {
        $step = helper_getStepYieldingInputArrayAsSeparateOutputs();
        $step->oneOutputPerInput();
        $step->uniqueOutputs();

        $firstResult = helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']);

        expect($firstResult)->toHaveCount(1);

        expect($firstResult[0]->get())->toBe(['foo', 'bar', 'baz']);

        // Only the last element differs, which is enough for the combined output to count as new.
        $secondResult = helper_invokeStepWithInput($step, ['foo', 'bar', 'quz']);

        expect($secondResult)->toHaveCount(1);

        expect($secondResult[0]->get())->toBe(['foo', 'bar', 'quz']);

        // Exactly the same combined output as in the first invocation: nothing is yielded.
        expect(helper_invokeStepWithInput($step, ['foo', 'bar', 'baz']))->toHaveCount(0);
    },
);

/* -------------------------- validateAndSanitizeInput() -------------------------- */

it('calls the validateAndSanitizeInput method', function () {
    // validateAndSanitizeInput() appends a marker to the input and invoke() yields whatever it receives, so the
    // marker in the output shows that the method was called.
    $markingStep = new class extends Step {
        protected function validateAndSanitizeInput(mixed $input): string
        {
            return "{$input} validated and sanitized";
        }

        protected function invoke(mixed $input): Generator
        {
            yield $input;
        }
    };

    $outputs = helper_invokeStepWithInput($markingStep, 'inputValue');

    expect($outputs[0]->get())->toBe('inputValue validated and sanitized');
});

// A one-element array as input: the step ends up yielding that single element.
test(
    'when calling validateAndSanitizeStringOrStringable() and the input is array with a single element it tries to ' .
    'use that element as input value',
    function () {
        $stringStep = new class extends Step {
            protected function validateAndSanitizeInput(mixed $input): string
            {
                $sanitized = $this->validateAndSanitizeStringOrStringable($input);

                return $sanitized;
            }

            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }
        };

        $outputs = helper_invokeStepWithInput($stringStep, ['inputValue']);

        expect($outputs[0]->get())->toBe('inputValue');
    },
);

// An array with more than one element can't be reduced to a single string; the first logged message has to
// report the invalid input of type array.
test(
    'when calling validateAndSanitizeStringOrStringable() and the input is array with multiple elements it logs ' .
    'an error message',
    function () {
        $stringStep = new class extends Step {
            protected function validateAndSanitizeInput(mixed $input): string
            {
                $sanitized = $this->validateAndSanitizeStringOrStringable($input);

                return $sanitized;
            }

            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }
        };

        $dummyLogger = new DummyLogger();

        $stringStep->addLogger($dummyLogger);

        helper_invokeStepWithInput($stringStep, ['inputValue', 'foo' => 'bar']);

        expect($dummyLogger->messages)->not->toBeEmpty();

        $firstMessage = $dummyLogger->messages[0]['message'];

        expect($firstMessage)->toStartWith('A step was called with input that it can not work with:');

        expect($firstMessage)->toEndWith('. The invalid input is of type array.');
    },
);

// The InvalidArgumentException must not bubble up: no outputs, and its message is part of the first log entry.
test(
    'when throwing an InvalidArgumentException from the validateAndSanitizeInput() it is caught and logged as an error',
    function () {
        $throwingStep = new class extends Step {
            protected function validateAndSanitizeInput(mixed $input): string
            {
                throw new InvalidArgumentException('hey :)');
            }

            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }
        };

        $dummyLogger = new DummyLogger();

        $throwingStep->addLogger($dummyLogger);

        $outputs = helper_invokeStepWithInput($throwingStep, 'anything');

        expect($outputs)->toBeEmpty();

        expect($dummyLogger->messages)->not->toBeEmpty();

        expect($dummyLogger->messages[0]['message'])->toBe(
            'A step was called with input that it can not work with: hey :)',
        );
    },
);

// A plain Exception thrown in validateAndSanitizeInput() has to reach the caller, see the throws() expectation.
test(
    'when throwing an Exception that is not an InvalidArgumentException, from the validateAndSanitizeInput() it is ' .
    'not caught',
    function () {
        $throwingStep = new class extends Step {
            protected function validateAndSanitizeInput(mixed $input): string
            {
                throw new Exception('hey :)');
            }

            protected function invoke(mixed $input): Generator
            {
                yield $input;
            }
        };

        $throwingStep->addLogger(new DummyLogger());

        helper_invokeStepWithInput($throwingStep, 'anything');
    },
)->throws(Exception::class);

it('is possible that a step does not produce any output at all', function () {
    // Yields 'bar' only for the input 'foo' and nothing for any other input.
    $conditionalStep = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            if ($input !== 'foo') {
                return;
            }

            yield 'bar';
        }
    };

    expect(helper_invokeStepWithInput($conditionalStep, 'lol'))->toHaveCount(0);

    $outputs = helper_invokeStepWithInput($conditionalStep, 'foo');

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe('bar');
});

/* --------------------------- updateInputUsingOutput() --------------------------- */

// The callback joins input and output value with a space; the result has to come back wrapped in an Input.
test('You can add and call an updateInputUsingOutput callback', function () {
    $step = helper_getValueReturningStep('something');

    $step->updateInputUsingOutput(fn(mixed $input, mixed $output) => $input . ' ' . $output);

    $newInput = $step->callUpdateInputUsingOutput(new Input('Boo'), new Output('Yah!'));

    expect($newInput)->toBeInstanceOf(Input::class);

    expect($newInput->get())->toBe('Boo Yah!');
});

// Data kept on the original Input has to be carried over to the updated Input.
it('does not lose previously kept data, when updateInputUsingOutput() is called', function () {
    $step = helper_getValueReturningStep('something');

    $step->updateInputUsingOutput(fn(mixed $input, mixed $output) => $input . ' ' . $output);

    $originalInput = new Input('Some', ['foo' => 'bar']);

    $newInput = $step->callUpdateInputUsingOutput($originalInput, new Output('thing'));

    expect($newInput->keep)->toBe(['foo' => 'bar']);
});

/* -------------------------------- maxOutputs() -------------------------------- */

it('does not yield more outputs than defined via maxOutputs() method', function () {
    $step = helper_getValueReturningStep('yolo')->maxOutputs(3);

    // Five invocations against a limit of three: only the first three may yield an output.
    for ($invocation = 1; $invocation <= 5; $invocation++) {
        $outputs = helper_invokeStepWithInput($step, new Input('asdf'));

        $expectedCount = $invocation <= 3 ? 1 : 0;

        expect($outputs)->toHaveCount($expectedCount);
    }
});

// Three outputs per input against a limit of seven: 3 + 3 + 1.
it(
    'does not yield more outputs than defined via maxOutputs() when step yields multiple outputs per input and the ' .
    'limit is reached in the middle of the outputs resulting from one input',
    function () {
        $tripleStep = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield from ['one', 'two', 'three'];
            }
        };

        $tripleStep->maxOutputs(7);

        expect(helper_invokeStepWithInput($tripleStep, new Input('a')))->toHaveCount(3);

        expect(helper_invokeStepWithInput($tripleStep, new Input('b')))->toHaveCount(3);

        // Only one output is left until the limit is reached.
        expect(helper_invokeStepWithInput($tripleStep, new Input('c')))->toHaveCount(1);
    },
);

test('When a step has max outputs defined, it won\'t call the invoke method after the limit was reached', function () {
    // Counts how often invoke() is executed.
    $countingStep = new class extends Step {
        public int $_invokeCallCount = 0;

        protected function invoke(mixed $input): Generator
        {
            $this->_invokeCallCount++;

            yield 'something';
        }
    };

    $countingStep->maxOutputs(2);

    // Four inputs, but the limit of two outputs is already reached after the second one.
    foreach (['one', 'two', 'three', 'four'] as $inputValue) {
        helper_invokeStepWithInput($countingStep, new Input($inputValue));
    }

    expect($countingStep->_invokeCallCount)->toBe(2);
});

it('resets outputs count for maxOutputs rule when resetAfterRun() is called', function () {
    $step = helper_getValueReturningStep('gogogo')->maxOutputs(2);

    // Use up the limit of two outputs.
    foreach (['one', 'two'] as $inputValue) {
        helper_invokeStepWithInput($step, new Input($inputValue));
    }

    $step->resetAfterRun();

    // After the reset the step has to yield outputs again.
    $outputs = helper_invokeStepWithInput($step, new Input('three'));

    expect($outputs)->toHaveCount(1);
});

/* -------------------------------- outputKey() -------------------------------- */

// The scalar output 'bar' has to be wrapped in an array under the key given to outputKey().
it('converts non array output to array with a certain key using the outputKey() method', function () {
    $step = helper_getValueReturningStep('bar');

    $step->outputKey('foo');

    $outputs = helper_invokeStepWithInput($step);

    expect($outputs[0]->get())->toBe(['foo' => 'bar']);
});

test('keeping a scalar output value with keep() also works when outputKey() was used', function () {
    // A step that declares scalar output and always yields the string 'hey'.
    $step = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'hey';
        }

        public function outputType(): StepOutputType
        {
            return StepOutputType::Scalar;
        }
    };

    $step
        ->outputKey('greeting')
        ->keep();

    // NOTE(review): presumably this is where the keep() call without arguments is validated against the
    // step's output type - confirm against Step::validateBeforeRun().
    $step->validateBeforeRun(Http::get());

    $outputs = helper_invokeStepWithInput($step, 'guten tag');

    // Asserting only the output value would not cover keep() at all, the kept data has to be checked as well.
    expect($outputs[0]->get())->toBe(['greeting' => 'hey'])
        ->and($outputs[0]->keep)->toBe(['greeting' => 'hey']);
});

/* -------------------------------- refineOutput() -------------------------------- */

// A Closure passed to refineOutput() receives the output value and its return value becomes the new output.
it('applies a Closure refiner to the steps output', function () {
    $step = helper_getValueReturningStep('output');

    $step->refineOutput(fn(mixed $outputValue) => $outputValue . ' refined');

    $outputs = helper_invokeStepWithInput($step);

    expect($outputs[0]->get())->toBe('output refined');
});

// StringRefiner::betweenFirst('foo', 'baz') is expected to reduce 'foo bar baz' to 'bar'.
it('applies an instance of the RefinerInterface to the steps output', function () {
    $step = helper_getInputReturningStep();

    $betweenRefiner = StringRefiner::betweenFirst('foo', 'baz');

    $step->refineOutput($betweenRefiner);

    $outputs = helper_invokeStepWithInput($step, 'foo bar baz');

    expect($outputs[0]->get())->toBe('bar');
});

it('applies multiple refiners to the steps output in the order they\'re added', function () {
    $step = helper_getInputReturningStep();

    // Three refiners are registered one after another; the expected string below depends on that order.
    $step->refineOutput(StringRefiner::betweenFirst('foo', 'baz'));

    $step->refineOutput(fn (mixed $outputValue) => $outputValue . ' refined');

    $step->refineOutput(fn (mixed $outputValue) => $outputValue . ', and refined further');

    $refinedValue = helper_invokeStepWithInput($step, 'foo bar baz')[0]->get();

    expect($refinedValue)->toBe('bar refined, and refined further');
});

it('applies refiners to certain keys from array output when the key is provided', function () {
    $step = helper_getInputReturningStep();

    // Only the keys 'foo' and 'baz' get a refiner, 'bar' has none and is expected to stay untouched.
    $step->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'));

    $step->refineOutput('baz', fn (mixed $outputValue) => 'refined ' . $outputValue);

    $input = ['foo' => 'lorem ipsum dolor', 'bar' => 'bla', 'baz' => 'quz'];

    $refinedValue = helper_invokeStepWithInput($step, $input)[0]->get();

    expect($refinedValue)->toBe([
        'foo' => 'ipsum',
        'bar' => 'bla',
        'baz' => 'refined quz',
    ]);
});

test('you can apply multiple refiners to the same output array key', function () {
    $step = helper_getInputReturningStep();

    // Both refiners target the same key, 'foo'.
    $step->refineOutput('foo', StringRefiner::betweenFirst('lorem', 'dolor'));

    $step->refineOutput('foo', fn (mixed $outputValue) => $outputValue . ' yolo');

    $input = ['foo' => 'lorem ipsum dolor', 'bar' => 'bla'];

    $refinedValue = helper_invokeStepWithInput($step, $input)[0]->get();

    expect($refinedValue)->toBe([
        'foo' => 'ipsum yolo',
        'bar' => 'bla',
    ]);
});

it(
    'uses the original input value when applying a refiner, not only the value of an input array key chosen via ' .
    'useInputKey()',
    function () {
        $originalInput = ['foo' => 'one', 'bar' => 'two'];

        $step = helper_getInputReturningStep();

        // The refiner ignores the output value and returns the second argument it is called with.
        $step
            ->useInputKey('bar')
            ->refineOutput(fn (mixed $outputValue, mixed $originalInputValue) => $originalInputValue);

        $outputs = helper_invokeStepWithInput($step, $originalInput);

        // The whole input array is expected, not just the value of the chosen key 'bar'.
        expect($outputs[0]->get())->toBe(['foo' => 'one', 'bar' => 'two']);
    },
);

/* ------------------------------- outputKeyAliases() ------------------------------- */

test('you can define aliases for output keys and they are considered when using keep()', function () {
    // Step yielding the keys foo, bar and baz, and declaring the aliases woo, war and waz for them.
    $aliasedStep = new class extends Step {
        protected function outputKeyAliases(): array
        {
            return ['woo' => 'foo', 'war' => 'bar', 'waz' => 'baz'];
        }

        protected function invoke(mixed $input): Generator
        {
            yield ['foo' => 'one', 'bar' => 'two', 'baz' => 'three'];
        }
    };

    // keep() is called with aliases only; the alias 'war' is additionally mapped to the key 'far'.
    $aliasedStep->keep(['woo', 'far' => 'war', 'waz']);

    $keptData = helper_invokeStepWithInput($aliasedStep)[0]->keep;

    expect($keptData)->toBe([
        'woo' => 'one',
        'far' => 'two',
        'waz' => 'three',
    ]);
});

test('you can filter outputs using an output key alias', function () {
    // Step yielding the keys foo and bar, with 'baz' declared as an alias for 'bar'.
    $aliasedStep = new class extends Step {
        protected function outputKeyAliases(): array
        {
            return ['baz' => 'bar'];
        }

        protected function invoke(mixed $input): Generator
        {
            yield ['foo' => 'one', 'bar' => 'two'];
        }
    };

    // The filter addresses the value via the alias instead of the real key.
    $aliasedStep->where('baz', Filter::equal('two'));

    $firstOutput = helper_invokeStepWithInput($aliasedStep)[0];

    expect($firstOutput)->toBeInstanceOf(Output::class);
});

it('can filter by a key that only exists in the serialized version of an output object', function () {
    $objectYieldingStep = new class extends Step {
        protected function outputKeyAliases(): array
        {
            return ['quz' => 'baz'];
        }

        protected function invoke(mixed $input): Generator
        {
            // The yielded object has no property 'baz'; that key only appears in the __serialize() result.
            yield new class {
                public string $foo = 'one';

                public string $bar = 'two';

                /**
                 * @return string[]
                 */
                public function __serialize(): array
                {
                    return ['foo' => $this->foo, 'bar' => $this->bar, 'baz' => $this->bar];
                }
            };
        }
    };

    // 'quz' is an alias for 'baz', the key that only exists in the serialized array.
    $objectYieldingStep->where('quz', Filter::equal('two'));

    $firstOutput = helper_invokeStepWithInput($objectYieldingStep)[0];

    expect($firstOutput)->toBeInstanceOf(Output::class);
});


================================================
FILE: tests/Steps/XmlTest.php
================================================
<?php

namespace tests\Steps;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Steps\Dom;
use Crwlr\Crawler\Steps\Xml;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

use function tests\helper_getStepFilesContent;
use function tests\helper_invokeStepWithInput;

it('returns single strings when extract is called with a selector only', function () {
    $titleStep = Xml::each('bookstore book')->extract('title');

    $titles = helper_invokeStepWithInput($titleStep, helper_getStepFilesContent('Xml/bookstore.xml'));

    expect($titles)->toHaveCount(4);

    // Spot check the first and the last of the four outputs.
    expect($titles[0]->get())->toBe('Everyday Italian');

    expect($titles[3]->get())->toBe('Learning XML');
});

// The plain string selectors used here ('bookstore book', 'title', ...) are CSS selectors, so the test
// description names CSS selectors. Extraction via XPath (Dom::xPath()) is covered by a separate test.
it('extracts data from an XML document with CSS selectors per default', function () {
    $output = helper_invokeStepWithInput(
        Xml::each('bookstore book')->extract([
            'title' => 'title',
            'author' => 'author',
            'year' => 'year',
        ]),
        helper_getStepFilesContent('Xml/bookstore.xml'),
    );

    expect($output)->toHaveCount(4)
        ->and($output[0]->get())->toBe(
            ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'],
        )
        ->and($output[1]->get())->toBe(['title' => 'Harry Potter', 'author' => 'J K. Rowling', 'year' => '2005'])
        // A selector matching multiple elements within one book is expected to yield an array of values.
        ->and($output[2]->get())->toBe(
            [
                'title' => 'XQuery Kick Start',
                'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'],
                'year' => '2003',
            ],
        )
        ->and($output[3]->get())->toBe(['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003']);
});

it('can also extract data using XPath queries', function () {
    // Base selector and all child selectors are XPath queries here.
    $xPathStep = Xml::each(Dom::xPath('//bookstore/book'))->extract([
        'title' => Dom::xPath('//title'),
        'author' => Dom::xPath('//author'),
        'year' => Dom::xPath('//year'),
    ]);

    $books = helper_invokeStepWithInput($xPathStep, helper_getStepFilesContent('Xml/bookstore.xml'));

    $expectedThirdBook = [
        'title' => 'XQuery Kick Start',
        'author' => ['James McGovern', 'Per Bothner', 'Kurt Cagle', 'James Linn', 'Vaidyanathan Nagarajan'],
        'year' => '2003',
    ];

    expect($books)->toHaveCount(4);

    expect($books[2]->get())->toBe($expectedThirdBook);
});

it('returns only one (compound) output when the root method is used', function () {
    $rootStep = Xml::root()->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']);

    $outputs = helper_invokeStepWithInput($rootStep, helper_getStepFilesContent('Xml/bookstore.xml'));

    // With root() as base, the titles of all books are expected to end up in the one single output.
    $allTitles = ['Everyday Italian', 'Harry Potter', 'XQuery Kick Start', 'Learning XML'];

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get()['title'])->toBe($allTitles);
});

it('extracts the data of the first matching element when the first method is used', function () {
    $firstBookStep = Xml::first('bookstore book')
        ->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']);

    $outputs = helper_invokeStepWithInput($firstBookStep, helper_getStepFilesContent('Xml/bookstore.xml'));

    // The expected data is the first book in the bookstore.xml fixture.
    $expectedBook = ['title' => 'Everyday Italian', 'author' => 'Giada De Laurentiis', 'year' => '2005'];

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe($expectedBook);
});

it('extracts the data of the last matching element when the last method is used', function () {
    $lastBookStep = Xml::last('bookstore book')
        ->extract(['title' => 'title', 'author' => 'author', 'year' => 'year']);

    $outputs = helper_invokeStepWithInput($lastBookStep, helper_getStepFilesContent('Xml/bookstore.xml'));

    // The expected data is the last book in the bookstore.xml fixture.
    $expectedBook = ['title' => 'Learning XML', 'author' => 'Erik T. Ray', 'year' => '2003'];

    expect($outputs)->toHaveCount(1);

    expect($outputs[0]->get())->toBe($expectedBook);
});

test(
    'you can extract data in a second level to the output array using another Xml step as an element in the mapping ' .
    'array',
    function () {
        $response = new RespondedRequest(
            new Request('GET', 'https://www.example.com/events.xml'),
            new Response(body: helper_getStepFilesContent('Xml/events.xml')),
        );

        // Nested step, used as a value in the mapping of the outer step below.
        $talksStep = Xml::each('talks talk')->extract([
            'title' => 'title',
            'speaker' => 'speaker',
        ]);

        $eventsStep = Xml::each('events event')->extract([
            'title' => 'name',
            'location' => 'location',
            'date' => 'date',
            'talks' => $talksStep,
        ]);

        $outputs = helper_invokeStepWithInput($eventsStep, $response);

        $expectedFirstEvent = [
            'title' => 'Some Meetup',
            'location' => 'Somewhere',
            'date' => '2023-01-14 20:00',
            'talks' => [
                ['title' => 'Sophisticated talk title', 'speaker' => 'Super Mario'],
                ['title' => 'Fun talk', 'speaker' => 'Princess Peach'],
            ],
        ];

        $expectedSecondEvent = [
            'title' => 'Another Meetup',
            'location' => 'Somewhere else',
            'date' => '2023-01-21 19:00',
            'talks' => [
                ['title' => 'Join the dark side', 'speaker' => 'Wario'],
                ['title' => 'Let\'s go', 'speaker' => 'Yoshi'],
            ],
        ];

        expect($outputs)->toHaveCount(2);

        expect($outputs[0]->get())->toBe($expectedFirstEvent);

        expect($outputs[1]->get())->toBe($expectedSecondEvent);
    },
);

test(
    'When a child step is nested in the extraction and does not use each(), the extracted value is an array with ' .
    'the keys defined in extract(), rather than an array of such arrays as it would be with each().',
    function () {
        $xml = <<<XML
            <?xml version="1.0" encoding="UTF-8"?>
            <companies>
            <company>
                <name>ABCDEFGmbH</name>
                <founded year="1984">foo</founded>
                <location>
                    <country>Germany</country>
                    <city>Frankfurt</city>
                </location>
            </company>
            <company>
                <name>Saubär GmbH</name>
                <founded year="2014">bar</founded>
                <location>
                    <country>Austria</country>
                    <city>Klagenfurt</city>
                </location>
            </company>
            </companies>
            XML;

        $expectedCompanies = [
            [
                'name' => 'ABCDEFGmbH',
                'founded' => '1984',
                'location' => ['country' => 'Germany', 'city' => 'Frankfurt'],
            ],
            [
                'name' => 'Saubär GmbH',
                'founded' => '2014',
                'location' => ['country' => 'Austria', 'city' => 'Klagenfurt'],
            ],
        ];

        // One nested "location" step per base method that is not each(): root(), first() and last().
        $locationSteps = [
            'root' => Xml::root()->extract([
                'country' => Dom::xPath('//location/country')->text(),
                'city' => Dom::cssSelector('location city')->text(),
            ]),
            'first' => Xml::first(Dom::cssSelector('location'))->extract([
                'country' => Dom::xPath('//country')->text(),
                'city' => Dom::cssSelector('city')->text(),
            ]),
            'last' => Xml::last(Dom::cssSelector('location'))->extract([
                'country' => Dom::xPath('//country')->text(),
                'city' => Dom::cssSelector('city')->text(),
            ]),
        ];

        // The same outer step and the same expectations are checked for each of the nested variants.
        foreach ($locationSteps as $locationStep) {
            $companiesStep = Xml::each(Dom::xPath('//companies/company'))->extract([
                'name' => Dom::cssSelector('name')->text(),
                'founded' => Dom::xPath('//founded')->attribute('year'),
                'location' => $locationStep,
            ]);

            $outputs = helper_invokeStepWithInput($companiesStep, $xml);

            expect($outputs)->toHaveCount(2)
                ->and($outputs[0]->get())->toBe($expectedCompanies[0])
                ->and($outputs[1]->get())->toBe($expectedCompanies[1]);
        }
    },
);

it('works when the response string starts with an UTF-8 byte order mark character', function () {
    // The response body comes from the rss-with-bom.xml fixture file.
    $rssResponse = new RespondedRequest(
        new Request('GET', 'https://www.example.com/rss'),
        new Response(body: helper_getStepFilesContent('Xml/rss-with-bom.xml')),
    );

    $itemsStep = Xml::each('channel item')->extract(['url' => 'link', 'title' => 'title']);

    $firstItem = helper_invokeStepWithInput($itemsStep, $rssResponse)[0]->get();

    expect($firstItem)->toBe([
        'url' => 'https://www.example.com/story/1234567/foo-bar-baz?ref=rss',
        'title' => 'Some title',
    ]);
});

test(
    'when selecting elements with each(), you can reference the element already selected within the each() selector ' .
    'itself, in sub selectors',
    function () {
        $xml = <<<XML
            <?xml version="1.0" encoding="utf-8"?>
            <data>
                <items>
                    <item attr="abc">
                        <id>123</id>
                        <subitems>
                            <subitem>
                                <id>456</id>
                            </subitem>
                        </subitems>
                    </item>
                </items>
            </data>
            XML;

        $response = new RespondedRequest(
            new Request('GET', 'https://www.example.com/foo'),
            new Response(body: $xml),
        );

        $itemStep = Xml::each('data items item')->extract([
            // The point of this test: the child selector mentions "item", the element that each() has
            // already selected. The expected id is 123, the direct child, not 456 from the nested subitem.
            'id' => Dom::cssSelector('item > id'),
            // With the empty selector, the expected value is the attr attribute of the item element itself.
            'attribute' => Dom::cssSelector('')->attribute('attr'),
        ]);

        $outputs = helper_invokeStepWithInput($itemStep, $response);

        expect($outputs)->toHaveCount(1);

        expect($outputs[0]->get())->toBe(['id' => '123', 'attribute' => 'abc']);
    },
);

it('works with tags with camelCase names', function () {
    $xml = <<<XML
        <?xml version="1.0" encoding="utf-8"?>
        <feed>
          <channelName>foo</channelName>
          <channelIdentifier>foo</channelIdentifier>
          <items>
            <item>
              <id>abc-123</id>
              <updated>2024-11-07T11:00:31Z</updated>
              <title>Foo bar baz!</title>
              <someUrl>https://www.example.com/item-1?utm_source=foo&amp;utm_medium=feed-xml</someUrl>
              <foo>
                <baRbaz>test</baRbaz>
              </foo>
            </item>
          </items>
        </feed>
        XML;

    $feedResponse = new RespondedRequest(
        new Request('GET', 'https://www.example.com/xml-feed'),
        new Response(body: $xml),
    );

    // The selectors 'someUrl' and 'foo baRbaz' contain upper case characters, like the tag names in the XML.
    $itemsStep = Xml::each(Dom::cssSelector('feed items item'))->extract([
        'title' => 'title',
        'some-url' => 'someUrl',
        'foo-bar-baz' => 'foo baRbaz',
    ]);

    $firstItem = helper_invokeStepWithInput($itemsStep, $feedResponse)[0]->get();

    expect($firstItem)->toBe([
        'title' => 'Foo bar baz!',
        'some-url' => 'https://www.example.com/item-1?utm_source=foo&utm_medium=feed-xml',
        'foo-bar-baz' => 'test',
    ]);
})->group('php84');


================================================
FILE: tests/Steps/_Files/Csv/basic.csv
================================================
123,"Otsch","https://www.otsch.codes"
234,"John Doe","https://www.john.doe"
345,"Jane Doe","https://www.jane.doe"


================================================
FILE: tests/Steps/_Files/Csv/enclosure.csv
================================================
123,?Kräftige Rindsuppe?,4.5
234,?Crispy Chicken Burger?,12
345,?Duett von Saibling und Forelle?,21


================================================
FILE: tests/Steps/_Files/Csv/escape.csv
================================================
123,"test %"escape%" test",test
123,"foo %"escape%" bar %"baz%" lorem",test


================================================
FILE: tests/Steps/_Files/Csv/separator.csv
================================================
123*"CoDerOtsch"*Christian*Olear*35
234*"g3n1u5"*Albert*Einstein*143
345*"sWiFtY"*Taylor*Swift*32


================================================
FILE: tests/Steps/_Files/Csv/with-column-headlines.csv
================================================
Stunde,Montag,Dienstag,Mittwoch,Donnerstag,Freitag
1,Mathematik,Deutsch,Englisch,Erdkunde,Politik
2,Sport,Deutsch,Englisch,Sport,Geschichte
3,Sport,"Religion (ev., kath.)",Kunst,,Kunst


================================================
FILE: tests/Steps/_Files/Html/basic.html
================================================
<!DOCTYPE html>
<html>
<head></head>
<body>
<p class="match">match 1</p>

<div class="list">
    <div class="item">
        <p class="match">match 2</p>
    </div>
    <div class="item">
        <p class="match">match 3</p>
    </div>
</div>
</body>
</html>


================================================
FILE: tests/Steps/_Files/Html/bookstore.html
================================================
<!DOCTYPE html>
<html lang="en">
    <head>
        <title>Bookstore Example in HTML :)</title>
    </head>
    <body>
        <div id="bookstore">

            <div class="book" data-category="cooking">
                <h3 class="title" lang="en">Everyday Italian</h3>
                <div class="author">Giada De Laurentiis</div>
                <span class="year">2005</span> - <span class="price">30.00</span>
            </div>

            <div class="book" data-category="children">
                <h3 class="title" lang="en">Harry Potter</h3>
                <div class="author">J K. Rowling</div>
                <span class="year">2005</span> - <span class="price">29.99</span>
            </div>

            <div class="book" data-category="web">
                <h3 class="title" lang="en">XQuery Kick Start</h3>
                <span class="author">James McGovern</span>,
                <span class="author">Per Bothner</span>,
                <span class="author">Kurt Cagle</span>,
                <span class="author">James Linn</span>,
                <span class="author">Vaidyanathan Nagarajan</span>
                <span class="year">2003</span> - <span class="price">49.99</span>
            </div>

            <div class="book" data-category="web" data-cover="paperback">
                <h3 class="title" lang="en">Learning XML</h3>
                <div class="author">Erik T. Ray</div>
                <span class="year">2003</span> - <span class="price">39.95</span>
            </div>

        </div>
    </body>
</html>


================================================
FILE: tests/Steps/_Files/Html/event.html
================================================
<!DOCTYPE html>
<html lang="en">
<head>
    <title>Bookstore Example in HTML :)</title>
</head>
<body>
<div id="event">
    <h1>Some Meetup</h1>
    <div class="location">Somewhere</div>
    <div class="date">2023-01-14 21:00</div>

    <div class="talks">
        <div class="talk">
            <h2 class="title">Sophisticated talk title</h2>
            <div class="speaker">Super Mario</div>
            <a href="slides/talk1.pdf" class="slidesLink">Slides</a>
        </div>
        <div class="talk">
            <h2 class="title">Simple beginner talk</h2>
            <div class="speaker">Luigi</div>
            <a href="slides/talk2.pdf" class="slidesLink">Slides</a>
        </div>
        <div class="talk">
            <h2 class="title">Fun talk</h2>
            <div class="speaker">Princess Peach</div>
            <a href="slides/talk3.pdf" class="slidesLink">Slides</a>
        </div>
    </div>
</div>
</body>
</html>


================================================
FILE: tests/Steps/_Files/Xml/bookstore.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<bookstore>

    <book category="cooking">
        <title lang="en">Everyday Italian</title>
        <author>Giada De Laurentiis</author>
        <year>2005</year>
        <price>30.00</price>
    </book>

    <book category="children">
        <title lang="en">Harry Potter</title>
        <author>J K. Rowling</author>
        <year>2005</year>
        <price>29.99</price>
    </book>

    <book category="web">
        <title lang="en">XQuery Kick Start</title>
        <author>James McGovern</author>
        <author>Per Bothner</author>
        <author>Kurt Cagle</author>
        <author>James Linn</author>
        <author>Vaidyanathan Nagarajan</author>
        <year>2003</year>
        <price>49.99</price>
    </book>

    <book category="web" cover="paperback">
        <title lang="en">Learning XML</title>
        <author>Erik T. Ray</author>
        <year>2003</year>
        <price>39.95</price>
    </book>

</bookstore>


================================================
FILE: tests/Steps/_Files/Xml/events.xml
================================================
<?xml version="1.0" encoding="UTF-8"?>
<events>

    <event>
        <name>Some Meetup</name>
        <location>Somewhere</location>
        <date>2023-01-14 20:00</date>
        <talks>
            <talk>
                <title>Sophisticated talk title</title>
                <speaker>Super Mario</speaker>
            </talk>
            <talk>
                <title>Fun talk</title>
                <speaker>Princess Peach</speaker>
            </talk>
        </talks>
    </event>

    <event>
        <name>Another Meetup</name>
        <location>Somewhere else</location>
        <date>2023-01-21 19:00</date>
        <talks>
            <talk>
                <title>Join the dark side</title>
                <speaker>Wario</speaker>
            </talk>
            <talk>
                <title>Let's go</title>
                <speaker>Yoshi</speaker>
            </talk>
        </talks>
    </event>

</events>


================================================
FILE: tests/Steps/_Files/Xml/rss-with-bom.xml
================================================
﻿<?xml version="1.0" encoding="utf-8"?><rss version="2.0"><channel><title>Foo - Bar</title><link>https://www.example.com/something</link><description>lorem ipsum dolor sit</description><language>de-de</language><item><guid isPermaLink="true">https://www.example.com/story/1234567/foo-bar-baz?ref=rss</guid><link>https://www.example.com/story/1234567/foo-bar-baz?ref=rss</link><category domain="https://www.example.com/something">Foo</category><category domain="https://www.example.com/something/else">Bar</category><title>Some title</title><description>lorem ipsum dolor sit amet</description><pubDate>Mon, 08 May 2023 14:08:21 Z</pubDate><group xmlns="http://search.yahoo.com/mrss/"><content width="150" url="https://example.com/yolo.jpg" /><content width="800" url="https://example.com/yolo.jpg" /><credit>Foto: Foo/Bar</credit></group><content width="150" url="https://example.com/yolo.jpg" xmlns="http://search.yahoo.com/mrss/" /><credit xmlns="http://search.yahoo.com/mrss/">Foto: Foo/Bar</credit></item></channel></rss>


================================================
FILE: tests/Stores/JsonFileStoreTest.php
================================================
<?php

namespace tests\Stores;

use Crwlr\Crawler\Result;
use Crwlr\Crawler\Stores\JsonFileStore;

/**
 * Creates a Result object and sets every key/value pair of the given array on it.
 *
 * @param mixed[] $data
 */
function helper_getResultWithJsonData(array $data): Result
{
    $result = new Result();

    foreach (array_keys($data) as $property) {
        $result->set($property, $data[$property]);
    }

    return $result;
}

it('saves Results to a JSON file', function () {
    $store = new JsonFileStore(__DIR__ . '/_files', 'test');

    // Pairs of result data to store, and the complete file content expected right after storing it.
    $cases = [
        [
            ['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear'],
            '[{"user":"otsch","firstname":"Christian","surname":"Olear"}]',
        ],
        [
            ['user' => 'hader', 'firstname' => 'Josef', 'surname' => 'Hader'],
            '[{"user":"otsch","firstname":"Christian","surname":"Olear"},' .
            '{"user":"hader","firstname":"Josef","surname":"Hader"}]',
        ],
        [
            ['user' => 'evamm', 'firstname' => 'Eva Maria', 'surname' => 'Maier'],
            '[{"user":"otsch","firstname":"Christian","surname":"Olear"},' .
            '{"user":"hader","firstname":"Josef","surname":"Hader"},' .
            '{"user":"evamm","firstname":"Eva Maria","surname":"Maier"}]',
        ],
    ];

    foreach ($cases as [$resultData, $expectedFileContent]) {
        $store->store(helper_getResultWithJsonData($resultData));

        expect(file_get_contents($store->filePath()))->toBe($expectedFileContent);
    }
});

// Removes all .json files from the _files directory next to this test file after the tests have run.
afterAll(function () {
    $dir = __DIR__ . '/_files';

    if (!file_exists($dir)) {
        return;
    }

    $files = scandir($dir);

    // scandir() returns false on failure; there is nothing to clean up in that case.
    if (!is_array($files)) {
        return;
    }

    foreach ($files as $file) {
        if ($file === '.' || $file === '..' || !str_ends_with($file, '.json')) {
            continue;
        }

        // No error suppression, so a failing cleanup is visible instead of silently leaving files behind.
        unlink($dir . '/' . $file);
    }
});


================================================
FILE: tests/Stores/SimpleCsvFileStoreTest.php
================================================
<?php

namespace tests\Stores;

use Crwlr\Crawler\Result;
use Crwlr\Crawler\Stores\SimpleCsvFileStore;

/**
 * Creates a Result object and sets every key/value pair of the given array on it.
 *
 * @param mixed[] $data
 */
function helper_getResultWithData(array $data): Result
{
    $result = new Result();

    foreach (array_keys($data) as $property) {
        $result->set($property, $data[$property]);
    }

    return $result;
}

it('saves Results to a csv file', function () {
    $store = new SimpleCsvFileStore(__DIR__ . '/_files', 'test');

    // Pairs of result data to store, and the complete file content expected right after storing it.
    $cases = [
        [
            ['user' => 'otsch', 'firstname' => 'Christian', 'surname' => 'Olear'],
            "user,firstname,surname\notsch,Christian,Olear\n",
        ],
        [
            ['user' => 'hader', 'firstname' => 'Josef', 'surname' => 'Hader'],
            "user,firstname,surname\notsch,Christian,Olear\nhader,Josef,Hader\n",
        ],
        [
            ['user' => 'evamm', 'firstname' => 'Eva Maria', 'surname' => 'Maier'],
            "user,firstname,surname\notsch,Christian,Olear\nhader,Josef,Hader\nevamm,\"Eva Maria\",Maier\n",
        ],
    ];

    foreach ($cases as [$resultData, $expectedFileContent]) {
        $store->store(helper_getResultWithData($resultData));

        expect(file_get_contents($store->filePath()))->toBe($expectedFileContent);
    }
});

test('if the value of a result property is an array, it concatenates the values separated with a pipe', function () {
    $store = new SimpleCsvFileStore(__DIR__ . '/_files', 'test2');

    // First result: the array value of col2 is expected as one column, joined with " | ".
    $store->store(helper_getResultWithData(['col1' => 'foo', 'col2' => ['bar', 'baz', 'quz']]));

    $contentAfterFirstResult = file_get_contents($store->filePath());

    expect($contentAfterFirstResult)->toBe("col1,col2\nfoo,\"bar | baz | quz\"\n");

    // Second result: expected as a further line after the first one.
    $store->store(helper_getResultWithData(['col1' => 'Donald', 'col2' => ['Tick', 'Trick', 'Track']]));

    $contentAfterSecondResult = file_get_contents($store->filePath());

    expect($contentAfterSecondResult)->toBe(
        "col1,col2\nfoo,\"bar | baz | quz\"\nDonald,\"Tick | Trick | Track\"\n",
    );
});

// Removes all .csv files from the _files directory next to this test file after the tests have run.
afterAll(function () {
    $directory = __DIR__ . '/_files';

    if (!file_exists($directory)) {
        return;
    }

    $entries = scandir($directory);

    // scandir() returns false on failure; there is nothing to clean up in that case.
    if (!is_array($entries)) {
        return;
    }

    foreach ($entries as $entry) {
        if ($entry !== '.' && $entry !== '..' && str_ends_with($entry, '.csv')) {
            unlink($directory . '/' . $entry);
        }
    }
});


================================================
FILE: tests/Stores/_files/.gitkeep
================================================


================================================
FILE: tests/UserAgents/BotUserAgentTest.php
================================================
<?php

namespace tests\UserAgents;

use Crwlr\Crawler\UserAgents\BotUserAgent;
use PHPUnit\Framework\TestCase;

/** @var TestCase $this */

test('Manually create UserAgent instance', function () {
    // The user agent object is passed directly as the haystack of the string assertion.
    $this->assertStringContainsString('SomeBot', new BotUserAgent('SomeBot'));
});

test('Create UserAgent instance via static make method', function () {
    // Same check as with the constructor, but the instance comes from the static factory method.
    $this->assertStringContainsString('CrwlrBot', BotUserAgent::make('CrwlrBot'));
});

test('Create instance with info uri', function () {
    // The info URI is expected after the product token, separated by "; +".
    $this->assertStringContainsString(
        'SomeBot; +https://www.example.com/somebot',
        new BotUserAgent('SomeBot', 'https://www.example.com/somebot'),
    );
});

test('Create instance with info uri and version', function () {
    // The version is expected right after the bot name, separated by a slash.
    $this->assertStringContainsString(
        'SomeBot/1.3; +https://www.example.com/somebot',
        new BotUserAgent('SomeBot', 'https://www.example.com/somebot', '1.3'),
    );
});

test('Create instance with version but without info uri', function () {
    // Without an info URI, the closing parenthesis is expected to follow the version directly.
    $this->assertStringContainsString('SomeBot/1.3)', new BotUserAgent('SomeBot', version: '1.3'));
});

test('User agent string starts with Mozilla/5.0', function () {
    $userAgentString = (new BotUserAgent('ExampleBot', 'https://www.example.com/bot', '2.0'))->__toString();

    expect($userAgentString)->toStartWith('Mozilla/5.0');
});


================================================
FILE: tests/UserAgents/UserAgentTest.php
================================================
<?php

namespace tests\UserAgents;

use Crwlr\Crawler\UserAgents\UserAgent;

test(
    'It can be created with any string in constructor and the __toString method returns that string',
    function ($string) {
        // The plain UserAgent is expected to hand back its constructor argument verbatim.
        $stringified = (new UserAgent($string))->__toString();

        expect($stringified)->toBe($string);
    },
)->with([
    '',
    'Foo',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 ' .
    'Safari/537.36',
    '%$§$!")(=aäöüäö?ßß``2304980=)(§$/&!"=)=',
]);


================================================
FILE: tests/Utils/GzipTest.php
================================================
<?php

namespace tests\Utils;

use Crwlr\Crawler\Utils\Gzip;

it('encodes a string', function () {
    // Highly repetitive input, so the encoded form is expected to be clearly shorter.
    $original = str_repeat('Hello World! ', 100);
    $encoded = Gzip::encode($original);

    expect($encoded)->not->toBe($original);
    expect(strlen($encoded))->toBeLessThan(strlen($original));
});

it('decodes a string', function () {
    $plain = 'Hello World!';
    $encoded = Gzip::encode($plain);

    // Encoding must change the string, decoding must give back the original.
    expect($encoded)->not->toBe($plain);
    expect(Gzip::decode($encoded))->toBe($plain);
});

it('does not generate a warning, when string to decode actually isn\'t encoded', function () {
    $warnings = [];

    // Temporarily install an error handler that records every E_WARNING message. Returning false lets PHP's
    // standard error handling continue afterwards.
    set_error_handler(function ($errno, $errstr) use (&$warnings) {
        if ($errno === E_WARNING) {
            $warnings[] = $errstr;
        }

        return false;
    });

    try {
        $decoded = Gzip::decode('Hello World!');
    } finally {
        // Restore the previous handler even when decode() throws, so the recording handler installed above
        // can't leak into tests running after this one.
        restore_error_handler();
    }

    expect($decoded)->toBe('Hello World!')
        ->and($warnings)->toBeEmpty();
});


================================================
FILE: tests/Utils/HttpHeadersTest.php
================================================
<?php

namespace tests\Utils;

use Crwlr\Crawler\Utils\HttpHeaders;

it('normalizes a headers array', function () {
    // A scalar header value is wrapped in a list, values that already are lists stay untouched.
    $normalized = HttpHeaders::normalize([
        'Accept-Language' => 'de',
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
    ]);

    expect($normalized)->toBe([
        'Accept-Language' => ['de'],
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
    ]);
});

it('merges two header arrays', function () {
    $base = [
        'Accept-Language' => ['de'],
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
    ];

    $additional = [
        'Accept' => ['text/html', 'application/xhtml+xml', 'application/xml'],
        'Accept-Language' => ['de', 'en'],
    ];

    $merged = HttpHeaders::merge($base, $additional);

    // The value 'de' that both arrays contain is not duplicated, and the new header is appended at the end.
    expect($merged)->toBe([
        'Accept-Language' => ['de', 'en'],
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
        'Accept' => ['text/html', 'application/xhtml+xml', 'application/xml'],
    ]);
});

it('adds a single value to a certain header in a headers array', function () {
    $result = HttpHeaders::addTo(['Accept-Language' => ['de']], 'Accept-Language', 'en');

    expect($result)->toBe(['Accept-Language' => ['de', 'en']]);
});

it('adds an array of values to a certain header in a headers array', function () {
    $result = HttpHeaders::addTo(['Accept-Language' => ['de']], 'Accept-Language', ['en-US', 'en']);

    expect($result)->toBe(['Accept-Language' => ['de', 'en-US', 'en']]);
});

it('adds the header when calling addTo() with a header name that the array does not contain yet', function () {
    $result = HttpHeaders::addTo(
        ['Accept-Encoding' => ['gzip', 'deflate', 'br']],
        'Accept-Language',
        ['de', 'en'],
    );

    expect($result)->toBe([
        'Accept-Encoding' => ['gzip', 'deflate', 'br'],
        'Accept-Language' => ['de', 'en'],
    ]);
});


================================================
FILE: tests/Utils/OutputTypeHelperTest.php
================================================
<?php

namespace tests\Utils;

use Crwlr\Crawler\Utils\OutputTypeHelper;
use stdClass;

it('converts an object with a toArrayForResult() method to an array', function () {
    $subject = new class {
        /**
         * @return string[]
         */
        public function toArrayForResult(): array
        {
            return ['foo' => 'bar', 'baz'];
        }
    };

    $converted = OutputTypeHelper::objectToArray($subject);

    expect($converted)->toBe(['foo' => 'bar', 'baz']);
});

it('converts an object with a toArray() method to an array', function () {
    $subject = new class {
        /**
         * @return string[]
         */
        public function toArray(): array
        {
            return ['foo' => 'bar'];
        }
    };

    $converted = OutputTypeHelper::objectToArray($subject);

    expect($converted)->toBe(['foo' => 'bar']);
});

it('converts an object with a __serialize() method to an array', function () {
    $subject = new class {
        public function __serialize(): array
        {
            return ['winnie' => 'the pooh'];
        }
    };

    $converted = OutputTypeHelper::objectToArray($subject);

    expect($converted)->toBe(['winnie' => 'the pooh']);
});

it('converts an object to an array by just casting it', function () {
    // No conversion method at all: only the public properties can end up in the array.
    $subject = new class {
        public string $foo = 'one';

        public string $bar = 'two';
    };

    $converted = OutputTypeHelper::objectToArray($subject);

    expect($converted)->toBe(['foo' => 'one', 'bar' => 'two']);
});

it('checks if a value is a scalar value', function (mixed $value, bool $expectedResult) {
    $isScalar = OutputTypeHelper::isScalar($value);

    expect($isScalar)->toBe($expectedResult);
})->with([
    ['foo', true],
    [123, true],
    [true, true],
    [false, true],
    [1.23, true],
    // For the output types, only associative arrays count as non-scalar, so a plain list is "scalar" here.
    [['foo', 'bar'], true],
    [['foo' => 'bar'], false],
    [new stdClass(), false],
]);

it('checks if a value is an associative array', function (mixed $value, bool $expectedResult) {
    $isAssociativeArray = OutputTypeHelper::isAssociativeArray($value);

    expect($isAssociativeArray)->toBe($expectedResult);
})->with([
    ['foo', false],
    [['foo', 'bar'], false],
    [['foo' => 'bar'], true],
    [new stdClass(), false],
]);

it(
    'checks if a value is an associative array or object (a.k.a. non-scalar)',
    function (mixed $value, bool $expectedResult) {
        $isNonScalar = OutputTypeHelper::isAssociativeArrayOrObject($value);

        expect($isNonScalar)->toBe($expectedResult);
    },
)->with([
    ['foo', false],
    [['foo', 'bar'], false],
    [['foo' => 'bar'], true],
    [new stdClass(), true],
]);


================================================
FILE: tests/Utils/RequestKeyTest.php
================================================
<?php

namespace tests\Utils;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Utils\RequestKey;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Response;

it('makes a cache key from a Request object', function () {
    $expectedKey = 'fc2a9e78c97e68674201853cea4a3d74';

    $request = new Request('GET', 'https://www.crwlr.software/packages', ['accept-encoding' => 'gzip, deflate, br']);

    expect(RequestKey::from($request))->toBe($expectedKey);

    // An additional header has to lead to a different key.
    $requestWithLanguage = $request->withAddedHeader('accept-language', 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7');

    expect(RequestKey::from($requestWithLanguage))->not->toBe($expectedKey);
});

it('makes a cache key from a RespondedRequest object', function () {
    $request = new Request('GET', 'https://www.crwl.io/en/home', ['accept-encoding' => 'gzip, deflate, br']);

    $key = RequestKey::from(new RespondedRequest($request, new Response()));

    expect($key)->toBe('08bcc643c9fb21af5e4f3361243e2220');
});

test('when creating the key it ignores cookies in the sent headers by default', function () {
    $url = 'https://www.crwlr.software/packages';

    $keyWithoutCookie = RequestKey::from(
        new Request('GET', $url, ['accept-encoding' => 'gzip, deflate, br']),
    );

    // Same request, but this time also sending a Cookie header.
    $keyWithCookie = RequestKey::from(
        new Request('GET', $url, [
            'accept-encoding' => 'gzip, deflate, br',
            'Cookie' => 'cookieName=v4lu3',
        ]),
    );

    expect($keyWithCookie)->toBe($keyWithoutCookie);
});

it('also ignores other headers when provided in second parameter', function () {
    $lowerCaseHeaderRequest = new Request('GET', 'https://www.example.com', ['accept-encoding' => 'gzip, deflate, br']);

    $keyWithAcceptEncodingHeader = RequestKey::from($lowerCaseHeaderRequest);

    $keyWithoutAcceptEncodingHeader = RequestKey::from($lowerCaseHeaderRequest, ['accept-encoding']);

    expect($keyWithAcceptEncodingHeader)->not->toBe($keyWithoutAcceptEncodingHeader);

    // The header name is written differently (upper case first letters) and has a different value here. As the
    // header is ignored, the key must be the same as the one above.
    $mixedCaseHeaderRequest = new Request('GET', 'https://www.example.com', ['Accept-Encoding' => 'gzip']);

    $anotherKeyWithoutAcceptEncodingHeader = RequestKey::from($mixedCaseHeaderRequest, ['accept-encoding']);

    expect($keyWithoutAcceptEncodingHeader)->toBe($anotherKeyWithoutAcceptEncodingHeader);
});


================================================
FILE: tests/Utils/TemplateStringTest.php
================================================
<?php

namespace tests\Utils;

use Crwlr\Crawler\Utils\TemplateString;

it('resolves the variable syntax in a string with data from an array', function () {
    // Covers unquoted, single quoted and double quoted variable names (also with escaped quotes inside), and a
    // markdown link that only looks similar to the variable syntax.
    $template = <<<STRING
        https://www.example.com/[crwl:foo]/bar

        Lorem ipsum [crwl:'asdf'] dolor. Don't replace [crwl.io](/a/markdown/link) this.

        But [crwl:'asdf\'asdf'] this.

        Also with [crwl:"qu\"z"] quotes in it.
        STRING;

    $data = [
        'foo' => 'foo',
        'asdf' => 'asdf',
        'var' => 'yolo',
        'asdf\'asdf' => 'replace',
        'qu"z' => 'double',
    ];

    $resolved = TemplateString::resolve($template, $data);

    expect($resolved)->toBe(
        <<<STRING
        https://www.example.com/foo/bar

        Lorem ipsum asdf dolor. Don't replace [crwl.io](/a/markdown/link) this.

        But replace this.

        Also with double quotes in it.
        STRING,
    );
});

it('resolves two variables in one line (regex is non greedy)', function () {
    $resolved = TemplateString::resolve(
        'hi [crwl:"one"]/[crwl:two] bye',
        ['one' => 'bonjour', 'two' => 'ciao'],
    );

    expect($resolved)->toBe('hi bonjour/ciao bye');
});


================================================
FILE: tests/_Integration/GroupTest.php
================================================
<?php

namespace tests\_Integration;

use Crwlr\Crawler\Crawler;
use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

it(
    'gets both, data from html and the enclosed json-ld using two steps in a group and combines the results',
    function () {
        // Minimal crawler using the fast test loader.
        $crawler = new class extends HttpCrawler {
            public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
            {
                return helper_getFastLoader($userAgent, $logger);
            }

            protected function userAgent(): UserAgentInterface
            {
                return new BotUserAgent('MyBot');
            }
        };

        $crawler->input('http://localhost:8000/blog-post-with-json-ld');

        $crawler->addStep(Http::get());

        // Both steps in the group get the same loaded response: one reads the visible HTML, the other one the
        // schema.org data of type BlogPosting.
        $group = Crawler::group()
            ->addStep(
                Html::first('#content article.blog-post')
                    ->extract(['title' => 'h1', 'date' => '.date']),
            )
            ->addStep(
                Html::schemaOrg()
                    ->onlyType('BlogPosting')
                    ->extract([
                        'author' => 'author.name',
                        'keywords',
                    ]),
            )
            ->keep();

        $crawler->addStep($group);

        $results = helper_generatorToArray($crawler->run());

        $firstResult = $results[0]->toArray();

        expect($firstResult)->toBe([
            'title' => 'Prevent Homograph Attacks using the crwlr/url Package',
            'date' => '2022-01-19',
            'author' => 'Christian Olear',
            'keywords' => 'homograph, attack, security, idn, internationalized domain names, prevention, url, uri',
        ]);
    },
);


================================================
FILE: tests/_Integration/Http/CharsetTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

/**
 * Crawler for the charset test, using the fast test loader and a fixed user agent string.
 */
class CharsetExampleCrawler extends HttpCrawler
{
    protected function userAgent(): UserAgentInterface
    {
        return new UserAgent('SomeUserAgent');
    }

    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        return helper_getFastLoader($userAgent, $logger);
    }
}

it('Fixes non UTF-8 characters in HTML documents declared as UTF-8', function () {
    $crawler = new CharsetExampleCrawler();

    $crawler->input('http://localhost:8000/non-utf-8-charset');

    $crawler->addStep(Http::get());

    $crawler->addStep(Html::root()->extract(['foo' => '.element']));

    $results = helper_generatorToArray($crawler->run());

    // The superscript character must arrive intact in the extracted text.
    expect($results)->toHaveCount(1);

    expect($results[0]->toArray())->toBe(['foo' => '0 l/m²']);
});


================================================
FILE: tests/_Integration/Http/CrawlingTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\Http\Politeness\RetryErrorResponseHandler;
use Crwlr\Crawler\Loader\Http\Politeness\RobotsTxtHandler;
use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
use Crwlr\Crawler\Loader\Http\Politeness\TimingUnits\MultipleOf;
use Crwlr\Crawler\Result;
use Crwlr\Crawler\Steps\Dom\HtmlElement;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Crwlr\Url\Url;
use Crwlr\Utils\Microseconds;
use GuzzleHttp\Client;
use PHPUnit\Framework\TestCase;
use Psr\Http\Client\ClientInterface;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;

/**
 * HttpLoader for the crawling tests. It records the URL of every load() call in the public $loadedUrls property
 * and uses a robots.txt handler that reports the path /not-allowed as not allowed.
 */

class TestLoader extends HttpLoader
{
    /**
     * @var string[] The URLs passed to load(), in the order of the calls.
     */
    public array $loadedUrls = [];

    public function __construct(
        UserAgentInterface $userAgent,
        ?ClientInterface $httpClient = null,
        ?LoggerInterface $logger = null,
        ?Throttler $throttler = null,
        RetryErrorResponseHandler $retryErrorResponseHandler = new RetryErrorResponseHandler(),
        array $defaultGuzzleClientConfig = [],
    ) {
        parent::__construct(
            $userAgent,
            $httpClient,
            $logger,
            $throttler,
            $retryErrorResponseHandler,
            $defaultGuzzleClientConfig,
        );

        // Swap in a handler that always rejects /not-allowed and defers to the real implementation otherwise.
        $this->robotsTxtHandler = new class ($this, $this->logger) extends RobotsTxtHandler {
            public function isAllowed(UriInterface|Url|string $url): bool
            {
                // Strings and PSR-7 URIs are both converted to a Url instance first.
                if (!$url instanceof Url) {
                    $url = Url::parse($url);
                }

                return $url->path() !== '/not-allowed' && parent::isAllowed($url);
            }
        };
    }

    public function load(mixed $subject): ?RespondedRequest
    {
        // Track the URL before delegating, so it is also recorded when the parent doesn't return a response.
        $this->loadedUrls[] = (string) $this->validateSubjectType($subject)->getUri();

        return parent::load($subject);
    }
}

/**
 * Crawler used to check if crawling stays on the same host or the same domain.
 *
 * The PSR-18 HTTP client of its loader rewrites host and port of every request URI to localhost:8000 just before
 * sending it. The loader still thinks it loaded the page from the original URI, and the returned RespondedRequest
 * object also has that original URI as effectiveUri (except if the requested page redirects).
 */

class Crawler extends HttpCrawler
{
    protected function userAgent(): UserAgentInterface
    {
        return new UserAgent('SomeUserAgent');
    }

    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): TestLoader
    {
        $localhostClient = new class implements ClientInterface {
            private Client $guzzleClient;

            public function __construct()
            {
                $this->guzzleClient = new Client();
            }

            public function sendRequest(RequestInterface $request): ResponseInterface
            {
                // Whatever host was requested, the request actually goes to the local test server.
                $localUri = $request->getUri()->withHost('localhost')->withPort(8000);

                return $this->guzzleClient->sendRequest($request->withUri($localUri));
            }
        };

        $testLoader = new TestLoader($userAgent, $localhostClient, $logger);

        // Very short waiting times, to not slow down the tests unnecessarily.
        $testLoader->throttle()
            ->waitBetween(new MultipleOf(0.0001), new MultipleOf(0.0002))
            ->waitAtLeast(Microseconds::fromSeconds(0.0001));

        return $testLoader;
    }

    /**
     * Overridden only to narrow the return type, so phpstan doesn't complain.
     */
    public function getLoader(): TestLoader
    {
        return parent::getLoader(); // @phpstan-ignore-line
    }
}

/** @var TestCase $this */

it('stays on the same host by default', function () {
    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/main')
        ->addStep(Http::crawl());

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    // The link to the subdomain must not have been followed.
    expect($loadedUrls)->not->toContain('http://foo.example.com/crawling/main-on-subdomain');
});

it('stays on the same domain when method sameDomain() is called', function () {
    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/main')
        ->addStep(Http::crawl()->sameDomain());

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    // The link to the subdomain is followed, the link to a completely different domain is not.
    expect($loadedUrls)->toContain('http://foo.example.com/crawling/main-on-subdomain');

    expect($loadedUrls)->not->toContain('https://www.crwlr.software/packages/crawler');
});

it('stays on the same host when method sameHost() is called', function () {
    // sameHost() is called after sameDomain(), so the later call is expected to win.
    $crawlStep = Http::crawl()
        ->sameDomain()
        ->sameHost();

    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/main')
        ->addStep($crawlStep);

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    expect($loadedUrls)->not->toContain('http://foo.example.com/crawling/main-on-subdomain');
});

it('crawls every page of a website that is linked somewhere', function () {
    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/main')
        ->addStep(Http::crawl());

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    expect($loadedUrls)->toHaveCount(6);

    $expectedUrls = [
        'http://www.example.com/crawling/main',
        'http://www.example.com/crawling/sub1',
        'http://www.example.com/crawling/sub1/sub1',
        'http://www.example.com/crawling/sub2',
        'http://www.example.com/crawling/sub2/sub1',
        'http://www.example.com/crawling/sub2/sub1/sub1',
    ];

    foreach ($expectedUrls as $expectedUrl) {
        expect($loadedUrls)->toContain($expectedUrl);
    }
});

it('crawls only to a certain depth when the crawl depth is defined', function () {
    // Crawl depth => number of URLs expected to be loaded with that depth.
    $expectedLoadCounts = [1 => 3, 2 => 5];

    foreach ($expectedLoadCounts as $depth => $expectedLoadCount) {
        $crawler = new Crawler();

        $crawler
            ->input('http://www.example.com/crawling/main')
            ->addStep(Http::crawl()->depth($depth));

        $crawler->runAndTraverse();

        expect($crawler->getLoader()->loadedUrls)->toHaveCount($expectedLoadCount);
    }
});

it('extracts URLs from a sitemap if you call method inputIsSitemap()', function () {
    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/sitemap.xml')
        ->addStep(Http::crawl()->inputIsSitemap());

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    expect($loadedUrls)->toHaveCount(7);
});

it('fails to extract URLs if you provide a sitemap as input and don\'t call inputIsSitemap()', function () {
    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/sitemap.xml')
        ->addStep(Http::crawl());

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    // Only the sitemap itself was loaded, no URL was discovered in it.
    expect($loadedUrls)->toHaveCount(1);
});

it(
    'extracts URLs from a sitemap where the <urlset> tag contains attributes that cause symfony DomCrawler to fail',
    function () {
        $crawler = new Crawler();

        $crawler
            ->input('http://www.example.com/crawling/sitemap2.xml')
            ->addStep(Http::crawl()->inputIsSitemap());

        $crawler->runAndTraverse();

        $loadedUrls = $crawler->getLoader()->loadedUrls;

        expect($loadedUrls)->toHaveCount(7);
    },
);

it('loads only pages where the path starts with a certain string when method pathStartsWith() is called', function () {
    $crawlStep = Http::crawl()
        ->inputIsSitemap()
        ->pathStartsWith('/crawling/sub1');

    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/sitemap.xml')
        ->addStep($crawlStep);

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    expect($loadedUrls)->toHaveCount(3);

    // The sitemap (the input URL) itself is loaded too, although its path doesn't match.
    $expectedUrls = [
        'http://www.example.com/crawling/sitemap.xml',
        'http://www.example.com/crawling/sub1',
        'http://www.example.com/crawling/sub1/sub1',
    ];

    foreach ($expectedUrls as $expectedUrl) {
        expect($loadedUrls)->toContain($expectedUrl);
    }
});

it('loads only URLs where the path matches a regex when method pathMatches() is used', function () {
    $crawlStep = Http::crawl()
        ->inputIsSitemap()
        ->pathMatches('/^\/crawling\/sub[12]$/');

    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/sitemap.xml')
        ->addStep($crawlStep);

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    expect($loadedUrls)->toHaveCount(3);
});

it('loads only URLs where the Closure passed to method customFilter() returns true', function () {
    $allowedPaths = [
        '/crawling/main',
        '/crawling/sub1/sub1',
        '/crawling/sub2/sub1/sub1',
    ];

    $crawlStep = Http::crawl()
        ->inputIsSitemap()
        ->customFilter(fn (Url $url) => in_array($url->path(), $allowedPaths, true));

    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/sitemap.xml')
        ->addStep($crawlStep);

    $crawler->runAndTraverse();

    $loadedUrls = $crawler->getLoader()->loadedUrls;

    // 4 = the three allowed pages plus the input URL.
    expect($loadedUrls)->toHaveCount(4);

    foreach ($allowedPaths as $allowedPath) {
        expect($loadedUrls)->toContain('http://www.example.com' . $allowedPath);
    }
});

it(
    'receives the link element where the URL was found, as second param in the Closure passed to method ' .
    'customFilter() when it was found in an HTML document',
    function () {
        // Only follow links whose link text contains "Subpage 2".
        $crawlStep = Http::crawl()->customFilter(
            fn (Url $url, ?HtmlElement $linkElement) => $linkElement !== null &&
                str_contains($linkElement->text(), 'Subpage 2'),
        );

        $crawler = new Crawler();

        $crawler
            ->input('http://www.example.com/crawling/main')
            ->addStep($crawlStep);

        $crawler->runAndTraverse();

        $loadedUrls = $crawler->getLoader()->loadedUrls;

        expect($loadedUrls)->toHaveCount(4);

        $expectedUrls = [
            'http://www.example.com/crawling/main',
            'http://www.example.com/crawling/sub2',
            'http://www.example.com/crawling/sub2/sub1',
            'http://www.example.com/crawling/sub2/sub1/sub1',
        ];

        foreach ($expectedUrls as $expectedUrl) {
            expect($loadedUrls)->toContain($expectedUrl);
        }
    },
);

it(
    'loads all pages, but yields only responses where the URL path starts with a certain string, when methods ' .
    'pathStartsWith() and loadAllButYieldOnlyMatching() are called',
    function () {
        $crawlStep = Http::crawl()
            ->inputIsSitemap()
            ->pathStartsWith('/crawling/sub2')
            ->loadAllButYieldOnlyMatching();

        $crawler = new Crawler();

        $crawler
            ->input('http://www.example.com/crawling/sitemap.xml')
            ->addStep($crawlStep);

        $results = helper_generatorToArray($crawler->run());

        // Everything was loaded, but only the matching responses became results.
        expect($crawler->getLoader()->loadedUrls)->toHaveCount(7);

        expect($results)->toHaveCount(3);
    },
);

it(
    'loads all URLs, but yields only responses where the URL path matches a regex, when methods pathMatches() and ' .
    'loadAllButYieldOnlyMatching() are called',
    function () {
        $crawlStep = Http::crawl()
            ->inputIsSitemap()
            ->pathMatches('/^\/crawling\/sub[12]$/')
            ->loadAllButYieldOnlyMatching();

        $crawler = new Crawler();

        $crawler
            ->input('http://www.example.com/crawling/sitemap.xml')
            ->addStep($crawlStep);

        $results = helper_generatorToArray($crawler->run());

        // Everything was loaded, but only the matching responses became results.
        expect($crawler->getLoader()->loadedUrls)->toHaveCount(7);

        expect($results)->toHaveCount(2);
    },
);

it(
    'loads all URLs but yields only responses where the Closure passed to method customFilter() returns true, when ' .
    'methods customFilter() and loadAllButYieldOnlyMatching() are called',
    function () {
        $yieldedPaths = [
            '/crawling/main',
            '/crawling/sub1/sub1',
            '/crawling/sub2/sub1/sub1',
        ];

        $crawlStep = Http::crawl()
            ->inputIsSitemap()
            ->customFilter(fn (Url $url) => in_array($url->path(), $yieldedPaths, true))
            ->loadAllButYieldOnlyMatching();

        $crawler = new Crawler();

        $crawler
            ->input('http://www.example.com/crawling/sitemap.xml')
            ->addStep($crawlStep);

        $results = helper_generatorToArray($crawler->run());

        // Everything was loaded, but only the responses accepted by the filter became results.
        expect($crawler->getLoader()->loadedUrls)->toHaveCount(7);

        expect($results)->toHaveCount(3);
    },
);

it(
    'keeps the fragment parts in URLs and treats the same URL with a different fragment part as separate URLs when ' .
    'keepUrlFragment() was called',
    function () {
        // Background: URLs that differ only in the fragment part (#something) respond with the same content in
        // almost all cases. To avoid loading the same page multiple times, the step by default throws away the
        // fragment part of discovered URLs. This test covers opting out of that default.
        $crawler = new Crawler();

        $crawler
            ->input('http://www.example.com/crawling/main')
            ->addStep(Http::crawl()->keepUrlFragment()->keep(['url']));

        $results = helper_generatorToArray($crawler->run());

        expect($results)->toHaveCount(8);

        $urls = array_map(fn ($result) => $result->get('url'), $results);

        expect($urls)->toContain('http://www.example.com/crawling/sub2');

        expect($urls)->toContain('http://www.example.com/crawling/sub2#fragment1');

        expect($urls)->toContain('http://www.example.com/crawling/sub2#fragment2');
    },
);

it('stops crawling when maxOutputs is reached', function () {
    $crawlStep = Http::crawl()
        ->keepUrlFragment()
        ->maxOutputs(4);

    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/main')
        ->addStep($crawlStep);

    $results = helper_generatorToArray($crawler->run());

    // Not only the number of results is limited, no further URLs are loaded either.
    expect($results)->toHaveCount(4);

    expect($crawler->getLoader()->loadedUrls)->toHaveCount(4);
});

it('uses canonical links when useCanonicalLinks() is called', function () {
    $crawlStep = Http::crawl()
        ->useCanonicalLinks()
        ->keep(['url']);

    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/main')
        ->addStep($crawlStep);

    $results = helper_generatorToArray($crawler->run());

    $resultUrls = array_map(fn (Result $result) => $result->get('url'), $results);

    // The second result comes from loading sub1, but as its canonical link is sub1/sub1, that's the result URL.
    expect($resultUrls)->toBe([
        'http://www.example.com/crawling/main',
        'http://www.example.com/crawling/sub1/sub1',
        'http://www.example.com/crawling/sub2',
        'http://www.example.com/crawling/sub2/sub1/sub1',
    ]);

    // About the URLs that were actually requested:
    // - /crawling/sub1 has the canonical link /crawling/sub1/sub1. That URL wasn't loaded yet at this point, so
    //   when the link to it is discovered, it won't be loaded.
    // - /crawling/sub2/sub1 also has the canonical link /crawling/sub1/sub1. That URL already counts as loaded,
    //   so the response is not yielded as a separate result.
    expect($crawler->getLoader()->loadedUrls)->toBe([
        'http://www.example.com/crawling/main',
        'http://www.example.com/crawling/sub1',
        'http://www.example.com/crawling/sub2',
        'http://www.example.com/crawling/sub2/sub1',
        'http://www.example.com/crawling/sub2/sub1/sub1',
    ]);
});

it('does not yield the same page twice when a URL was redirected to an already loaded page', function () {
    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/crawling/redirect')
        ->addStep(Http::crawl()->keep(['url']));

    $results = helper_generatorToArray($crawler->run());

    $resultUrls = array_map(fn (Result $result) => $result->get('url'), $results);

    // The results contain the redirect target, not the redirecting URL itself.
    expect($resultUrls)->toContain('http://www.example.com/crawling/main');

    expect($resultUrls)->not->toContain('http://www.example.com/crawling/redirect');

    // The skipped duplicate is reported in the test output.
    expect($this->getActualOutputForAssertion())
        ->toContain('Was already loaded before. Do not process this page again.');
});

it('does not produce a fatal error when the initial request fails', function () {
    // The robots.txt handler of the TestLoader reports the path /not-allowed as not allowed.
    $crawler = new Crawler();

    $crawler
        ->input('http://www.example.com/not-allowed')
        ->addStep(Http::crawl()->keep(['url']));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(0);
});


================================================
FILE: tests/_Integration/Http/ErrorResponsesTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\Http\Exceptions\LoadingException;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;
use tests\_Stubs\DummyLogger;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

/**
 * Crawler for the error response tests: fast test loader, fixed user agent and a DummyLogger as logger.
 *
 * @method DummyLogger getLogger()
 */
class ErrorCrawler extends HttpCrawler
{
    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        return helper_getFastLoader($userAgent, $logger);
    }

    protected function userAgent(): UserAgentInterface
    {
        return new UserAgent('SomeBot');
    }

    protected function logger(): LoggerInterface
    {
        return new DummyLogger();
    }
}

it('does not yield client error responses by default', function (string $method) {
    $crawler = new ErrorCrawler();

    $crawler->inputs(['http://localhost:8000/client-error-response']);

    // $method comes from the dataset below and selects the Http step factory method.
    $crawler->addStep(Http::{$method}()->keepAs('response'));

    expect(helper_generatorToArray($crawler->run()))->toBeEmpty();
})->with(['get', 'post', 'put', 'patch', 'delete']);

it('does not yield server error responses by default', function (string $method) {
    $crawler = new ErrorCrawler();

    $crawler->inputs(['http://localhost:8000/server-error-response']);

    // $method comes from the dataset below and selects the Http step factory method.
    $crawler->addStep(Http::{$method}()->keepAs('response'));

    expect(helper_generatorToArray($crawler->run()))->toBeEmpty();
})->with(['get', 'post', 'put', 'patch', 'delete']);

it('yields client error responses when yieldErrorResponses() was called', function (string $method) {
    $crawler = new ErrorCrawler();

    $crawler->inputs(['http://localhost:8000/client-error-response']);

    // With yieldErrorResponses() the error response is expected to become a result.
    $crawler->addStep(
        Http::{$method}()
            ->yieldErrorResponses()
            ->keepAs('response'),
    );

    expect(helper_generatorToArray($crawler->run()))->toHaveCount(1);
})->with(['get', 'post', 'put', 'patch', 'delete']);

it('yields server error responses when yieldErrorResponses() was called', function (string $method) {
    $crawler = new ErrorCrawler();

    $crawler->inputs(['http://localhost:8000/server-error-response']);

    // With yieldErrorResponses() the error response is expected to become a result.
    $crawler->addStep(
        Http::{$method}()
            ->yieldErrorResponses()
            ->keepAs('response'),
    );

    expect(helper_generatorToArray($crawler->run()))->toHaveCount(1);
})->with(['get', 'post', 'put', 'patch', 'delete']);

it(
    'goes on crawling after a client error response when stopOnErrorResponse() wasn\'t called',
    function (string $method) {
        // First input responds with an error, the second one is a regular page.
        $urls = ['http://localhost:8000/client-error-response', 'http://localhost:8000/simple-listing'];

        $crawler = new ErrorCrawler();

        $crawler->inputs($urls);

        $crawler->addStep(Http::{$method}()->keepAs('response'));

        // Only the second input is expected to produce a result.
        expect(helper_generatorToArray($crawler->run()))->toHaveCount(1);
    },
)->with(['get', 'post', 'put', 'patch', 'delete']);

it(
    'goes on crawling after a server error response when stopOnErrorResponse() wasn\'t called',
    function (string $method) {
        // First input responds with an error, the second one is a regular page.
        $urls = ['http://localhost:8000/server-error-response', 'http://localhost:8000/simple-listing'];

        $crawler = new ErrorCrawler();

        $crawler->inputs($urls);

        $crawler->addStep(Http::{$method}()->keepAs('response'));

        // Only the second input is expected to produce a result.
        expect(helper_generatorToArray($crawler->run()))->toHaveCount(1);
    },
)->with(['get', 'post', 'put', 'patch', 'delete']);

it(
    'stops crawling (throws exception) after a client error response when the stopOnErrorResponse() method was called',
    function (string $method) {
        $urls = ['http://localhost:8000/client-error-response', 'http://localhost:8000/simple-listing'];

        $crawler = new ErrorCrawler();

        $crawler->inputs($urls);

        $crawler->addStep(Http::{$method}()->stopOnErrorResponse());

        // Expected to throw the LoadingException declared via throws() below.
        $crawler->runAndTraverse();
    },
)->with(['get', 'post', 'put', 'patch', 'delete'])->throws(LoadingException::class);

it(
    'stops crawling (throws exception) after a server error response when the stopOnErrorResponse() method was called',
    function (string $method) {
        $crawler = new ErrorCrawler();

        // The first input has to be the server error route. Requesting the client error route here
        // would only duplicate the client error test and leave the server error case untested.
        $crawler->inputs(['http://localhost:8000/server-error-response', 'http://localhost:8000/simple-listing'])
            ->addStep(
                Http::{$method}()
                    ->stopOnErrorResponse(),
            );

        // Expected to throw the LoadingException declared via throws() below.
        $crawler->runAndTraverse();
    },
)->with(['get', 'post', 'put', 'patch', 'delete'])->throws(LoadingException::class);

it('does not log warnings about multiple loader hook calls when stopOnErrorResponse() is used', function () {
    $crawler = new ErrorCrawler();

    $crawler->inputs(['http://localhost:8000/hello-world', 'http://localhost:8000/simple-listing']);

    $crawler->addStep(Http::get()->stopOnErrorResponse());

    $crawler->runAndTraverse();

    // None of the messages collected by the DummyLogger may contain the hook warning text.
    $loggedMessages = $crawler->getLogger()->messages;

    foreach ($loggedMessages as $logEntry) {
        expect($logEntry['message'])->not->toContain(' was already called in this load call.');
    }
});


================================================
FILE: tests/_Integration/Http/GzipTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Result;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

/**
 * Crawler used by the gzip test: identifies as "HelloWorldBot" and gets its loader
 * from the helper_getFastLoader() test helper.
 */
class GzipCrawler extends HttpCrawler
{
    /**
     * Creates the loader via the helper_getFastLoader() test helper.
     */
    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        $loader = helper_getFastLoader($userAgent, $logger);

        return $loader;
    }

    /**
     * The user agent the crawler identifies itself with.
     */
    protected function userAgent(): UserAgentInterface
    {
        $userAgent = new UserAgent('HelloWorldBot');

        return $userAgent;
    }
}

// NOTE(review): the description mentions the content-type header; gzip compression is usually
// signalled via Content-Encoding - confirm which header the /gzip route actually sends.
it('uncompresses gzip compressed response body when content-type header is sent', function () {
    $crawler = new GzipCrawler();

    $crawler->input('http://localhost:8000/gzip');

    $crawler->addStep(Http::get()->keepAs('response'));

    $results = helper_generatorToArray($crawler->run());

    $firstResult = $results[0];

    expect($firstResult)->toBeInstanceOf(Result::class);

    $response = $firstResult->get('response');

    // The body string read from the response must be the plain, uncompressed text.
    expect($response)->toBeInstanceOf(RespondedRequest::class)
        ->and(Http::getBodyString($response))->toBe('This is a gzip compressed string');
});


================================================
FILE: tests/_Integration/Http/HeadlessBrowserTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\Cache\FileCache;
use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\Http\Browser\ScreenshotConfig;
use Crwlr\Crawler\Loader\Http\Cookies\Cookie;
use Crwlr\Crawler\Loader\Http\Cookies\CookieJar;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Dom\HtmlDocument;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Loading\Http\Browser\BrowserAction;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Generator;
use Psr\Log\LoggerInterface;

use function tests\helper_cachedir;
use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;
use function tests\helper_resetStorageDir;
use function tests\helper_storagedir;

/**
 * Crawler used by the headless browser tests: identifies as "HeadlessBrowserBot" and
 * switches its loader to the headless browser.
 */
class HeadlessBrowserCrawler extends HttpCrawler
{
    /**
     * Creates the loader via helper_getFastLoader() and calls useHeadlessBrowser() on it.
     */
    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        $browserLoader = helper_getFastLoader($userAgent, $logger);

        $browserLoader->useHeadlessBrowser();

        return $browserLoader;
    }

    /**
     * The user agent the crawler identifies itself with.
     */
    protected function userAgent(): UserAgentInterface
    {
        $userAgent = new UserAgent('HeadlessBrowserBot');

        return $userAgent;
    }
}

/**
 * Step that reads the text of the `body pre` element from the response's HTML body
 * and yields it decoded as JSON (associative arrays).
 */
class GetJsonFromResponseHtmlBody extends Step
{
    protected function invoke(mixed $input): Generator
    {
        $document = new HtmlDocument(Http::getBodyString($input->response));

        $preElement = $document->querySelector('body pre');

        // Fall back to an empty string when there is no matching element (or no text).
        $jsonString = $preElement?->text() ?? '';

        yield json_decode($jsonString, true);
    }
}

/**
 * Step that yields the text of the `body` element from the response's HTML body.
 */
class GetStringFromResponseHtmlBody extends Step
{
    protected function invoke(mixed $input): Generator
    {
        $document = new HtmlDocument(Http::getBodyString($input->response));

        $bodyElement = $document->querySelector('body');

        // Fall back to an empty string when there is no body element (or no text).
        yield $bodyElement?->text() ?? '';
    }
}

/**
 * Reads the loader's `cookieJar` property (via invade()) and returns the cookies
 * it holds for the given domain.
 *
 * @return Cookie[]
 */
function helper_getCookiesByDomainFromLoader(HttpLoader $loader, string $domain): array
{
    /** @var CookieJar $jar */
    $jar = invade($loader)->cookieJar;

    return $jar->allByDomain($domain);
}

it('automatically uses the Loader\'s user agent', function () {
    $crawler = new HeadlessBrowserCrawler();

    $crawler->input('http://localhost:8000/print-headers');

    $crawler
        ->addStep(Http::get())
        ->addStep((new GetJsonFromResponseHtmlBody())->keepAs('responseBody'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    // The decoded JSON from the page, containing the request headers.
    $printedHeaders = $results[0]->get('responseBody');

    expect($printedHeaders)->toBeArray()
        ->and($printedHeaders)->toHaveKey('User-Agent')
        ->and($printedHeaders['User-Agent'])->toBe('HeadlessBrowserBot');
});

it(
    'does not use the user-agent defined in the crawler, when useNativeUserAgent() was called on the browser loader ' .
    'helper',
    function () {
        $crawler = new HeadlessBrowserCrawler();

        $browserHelper = $crawler->getLoader()->browser();

        $browserHelper->useNativeUserAgent();

        $crawler->input('http://localhost:8000/print-headers');

        $crawler
            ->addStep(Http::get())
            ->addStep((new GetJsonFromResponseHtmlBody())->keepAs('responseBody'));

        $results = helper_generatorToArray($crawler->run());

        expect($results)->toHaveCount(1);

        // The decoded JSON from the page, containing the request headers.
        $printedHeaders = $results[0]->get('responseBody');

        expect($printedHeaders)->toBeArray()
            ->and($printedHeaders)->toHaveKey('User-Agent')
            ->and($printedHeaders['User-Agent'])->toStartWith('Mozilla/5.0 (');
    },
);

it('uses cookies', function () {
    $crawler = new HeadlessBrowserCrawler();

    // Intermediate step: ignores its input and yields the URL of the page printing the cookie.
    $toPrintCookieUrl = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'http://localhost:8000/print-cookie';
        }
    };

    $crawler->input('http://localhost:8000/set-cookie');

    $crawler
        ->addStep(Http::get())
        ->addStep($toPrintCookieUrl)
        ->addStep(Http::get())
        ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    $printedCookie = $results[0]->get('printed-cookie');

    expect($printedCookie)->toBeString()
        ->and($printedCookie)->toBe('foo123');
});

it('does not use cookies when HttpLoader::dontUseCookies() was called', function () {
    $crawler = new HeadlessBrowserCrawler();

    $loader = $crawler->getLoader();

    $loader->dontUseCookies();

    // Intermediate step: ignores its input and yields the URL of the page printing the cookie.
    $toPrintCookieUrl = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'http://localhost:8000/print-cookie';
        }
    };

    $crawler->input('http://localhost:8000/set-cookie');

    $crawler
        ->addStep(Http::get())
        ->addStep($toPrintCookieUrl)
        ->addStep(Http::get())
        ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    // No cookie may have been sent, so the page prints nothing.
    expect($results[0]->get('printed-cookie'))->toBeEmpty();
});

it('renders javascript', function () {
    $crawler = new HeadlessBrowserCrawler();

    $crawler->input('http://localhost:8000/js-rendering');

    $crawler->addStep(Http::get());

    $crawler->addStep(Html::root()->extract(['content' => '#content p']));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    // The extracted paragraph text is the one expected to be inserted by the page's javascript.
    expect($results[0]->toArray())->toBe([
        'content' => 'This was added through javascript',
    ]);
});

it('gets cookies that are set via javascript', function () {
    $firstCrawler = new HeadlessBrowserCrawler();

    // Start with an empty cache, so the first run can't be served from it.
    $cache = new FileCache(helper_cachedir());

    $cache->clear();

    $firstCrawler->getLoader()->setCache($cache);

    $firstCrawler->input('http://localhost:8000/set-js-cookie');

    $firstCrawler->addStep(Http::get());

    helper_generatorToArray($firstCrawler->run());

    $cookiesAfterFirstRun = helper_getCookiesByDomainFromLoader($firstCrawler->getLoader(), 'localhost');

    $testCookie = $cookiesAfterFirstRun['testcookie'] ?? null;

    expect($cookiesAfterFirstRun)->toHaveCount(1)
        ->and($testCookie?->name())->toBe('testcookie')
        ->and($testCookie?->value())->toBe('javascriptcookie');

    // Check that cookie is not added to the cookiejar when the response was served from cache.
    $secondCrawler = new HeadlessBrowserCrawler();

    $secondCrawler->getLoader()->setCache($cache);

    $secondCrawler->input('http://localhost:8000/set-js-cookie');

    $secondCrawler->addStep(Http::get());

    helper_generatorToArray($secondCrawler->run());

    $cookiesAfterSecondRun = helper_getCookiesByDomainFromLoader($secondCrawler->getLoader(), 'localhost');

    expect($cookiesAfterSecondRun)->toHaveCount(0);
});

it('gets a cookie that is set via a click, executed via post browser navigate hook', function () {
    $crawler = new HeadlessBrowserCrawler();

    // Load step that clicks the #consent_btn element after navigating to the page.
    $loadAndClickConsent = Http::get()
        ->postBrowserNavigateHook(BrowserAction::clickElement('#consent_btn'));

    // Intermediate step: ignores its input and yields the URL of the page printing the cookie.
    $toPrintCookieUrl = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            yield 'http://localhost:8000/print-cookie';
        }
    };

    $crawler->input('http://localhost:8000/set-delayed-js-cookie');

    $crawler
        ->addStep($loadAndClickConsent)
        ->addStep($toPrintCookieUrl)
        ->addStep(Http::get())
        ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookie'));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    $printedCookie = $results[0]->get('printed-cookie');

    expect($printedCookie)->toBeString()
        ->and($printedCookie)->toBe('javascriptcookie');

    // The cookie must also have ended up in the loader's cookie jar.
    $jarCookies = helper_getCookiesByDomainFromLoader($crawler->getLoader(), 'localhost');

    $testCookie = $jarCookies['testcookie'] ?? null;

    expect($jarCookies)->toHaveCount(1)
        ->and($testCookie?->name())->toBe('testcookie')
        ->and($testCookie?->value())->toBe('javascriptcookie');
});

it(
    'sending cookies works correctly when the loader is not configured to use the browser but two steps use the ' .
    'browser by calling the useBrowser() method of Http steps',
    function () {
        $crawler = HttpCrawler::make()->withMozilla5CompatibleUserAgent();

        // Intermediate step: ignores its input and yields the URL of the page printing the cookies.
        $toPrintCookiesUrl = new class extends Step {
            protected function invoke(mixed $input): Generator
            {
                yield 'http://localhost:8000/print-cookies';
            }
        };

        $crawler->input('http://localhost:8000/set-multiple-js-cookies');

        // Both loading steps opt into the browser individually via useBrowser().
        $crawler
            ->addStep(Http::get()->useBrowser())
            ->addStep($toPrintCookiesUrl)
            ->addStep(Http::get()->useBrowser())
            ->addStep((new GetStringFromResponseHtmlBody())->keepAs('printed-cookies'));

        $results = helper_generatorToArray($crawler->run());

        expect($results)->toHaveCount(1);

        $printedCookies = $results[0]->get('printed-cookies');

        expect($printedCookies)->toBeString()
            ->and($printedCookies)
            ->toBe('cookie3=cookie3value;cookie2=cookie2value;cookie1=cookie1value');

        $jarCookies = helper_getCookiesByDomainFromLoader($crawler->getLoader(), 'localhost');

        expect($jarCookies)->toHaveCount(3)
            ->and($jarCookies['cookie1']->value())->toBe('cookie1value')
            ->and($jarCookies['cookie2']->value())->toBe('cookie2value')
            ->and($jarCookies['cookie3']->value())->toBe('cookie3value');
    },
);

// Runs a whole sequence of browser actions as post browser navigate hooks against the
// /browser-actions page and then checks the resulting body HTML and the screenshots taken.
test(
    'BrowserAction::clickElement(), clickInsideShadowDom(), evaluate(), moveMouseToElement(), ' .
    'moveMouseToPosition(), scrollDown(), scrollUp() and typeText() work as expected',
    function () {
        $crawler = new HeadlessBrowserCrawler();

        $crawler
            ->getLoader()
            ->browser()
            ->includeShadowElementsInHtml();

        $crawler
            ->input('http://localhost:8000/browser-actions')
            ->addStep(
                Http::get()
                    // Inserting the #click_element is delayed in the page, so this also tests, that the
                    // BrowserAction::clickElement() action automatically waits for an element matching the selector
                    // to be present.
                    ->postBrowserNavigateHook(BrowserAction::clickElement('#click_element'))
                    // First screenshot, taken with the default config (expected to be a .png below).
                    ->postBrowserNavigateHook(BrowserAction::screenshot(ScreenshotConfig::make(helper_storagedir())))
                    ->postBrowserNavigateHook(BrowserAction::clickInsideShadowDom('#shadow_host', '#shadow_click_div'))
                    ->postBrowserNavigateHook(
                        BrowserAction::evaluate(
                            'document.getElementById(\'evaluation_container\').innerHTML = \'evaluated\'',
                        ),
                    )
                    ->postBrowserNavigateHook(BrowserAction::moveMouseToElement('#mouseover_check_1'))
                    ->postBrowserNavigateHook(BrowserAction::moveMouseToPosition(305, 405))
                    ->postBrowserNavigateHook(BrowserAction::scrollDown(4000))
                    // Second screenshot: a full page jpeg with reduced quality.
                    ->postBrowserNavigateHook(
                        BrowserAction::screenshot(
                            ScreenshotConfig::make(helper_storagedir())
                                ->setImageFileType('jpeg')
                                ->setQuality(20)
                                ->setFullPage(),
                        ),
                    )
                    ->postBrowserNavigateHook(BrowserAction::scrollUp(2000))
                    ->postBrowserNavigateHook(BrowserAction::scrollUp(2000))
                    // Click the input element first, then type the text.
                    ->postBrowserNavigateHook(BrowserAction::clickElement('#input'))
                    ->postBrowserNavigateHook(BrowserAction::typeText('typing text works'))
                    ->keep(['body', 'screenshots']),
            );

        $results = helper_generatorToArray($crawler->run());

        $body = $results[0]->get('body');

        $screenshots = $results[0]->get('screenshots');

        expect($body)->toContain('<div id="click_worked">yes</div>')
            // This also tests the `HeadlessBrowserLoaderHelper::includeShadowElementsInHtml()` method,
            // because even if the click worked, with the normal way of getting HTML this wouldn't be
            // included in the returned HTML.
            ->and($body)->toContain('<div id="shadow_host"><div id="shadow_click_div">clicked</div></div>')
            ->and($body)->toContain('<div id="evaluation_container">evaluated</div>')
            ->and($body)->toContain('<div id="mouseover_check_1">mouse was here</div>')
            ->and($body)->toContain('<div id="mouseover_check_2">mouse was here</div>')
            ->and($body)->toContain('<div id="scroll_down_check">scrolled down</div>')
            ->and($body)->toContain('<div id="scroll_up_check">scrolled up</div>')
            ->and($body)->toContain('<div id="input_value">typing text works</div>')
            ->and($screenshots)->toHaveCount(2)
            ->and($screenshots[0])->toEndWith('.png')
            ->and($screenshots[1])->toEndWith('.jpeg');

        // The image heights are only checked when getimagesize() is available and can read both files.
        if (function_exists('getimagesize')) {
            $screenshot1Size = getimagesize($screenshots[0]);

            $screenshot2Size = getimagesize($screenshots[1]);

            if (is_array($screenshot1Size) && is_array($screenshot2Size)) {
                // Index 1 of the getimagesize() result is the image height: the full page
                // screenshot has to be a lot higher than the first one.
                expect($screenshot1Size[1])->toBeLessThan(2100)
                    ->and($screenshot2Size[1])->toBeGreaterThan(4000);
            }
        }

        // Reset the storage directory the screenshots were written to.
        helper_resetStorageDir();
    },
);

test('BrowserAction::waitUntilDocumentContainsElement() works as expected', function () {
    $crawler = new HeadlessBrowserCrawler();

    // Wait until an element matching #delayed_container is in the document.
    $waitForContainer = BrowserAction::waitUntilDocumentContainsElement('#delayed_container');

    $crawler->input('http://localhost:8000/browser-actions/wait');

    $crawler->addStep(
        Http::get()
            ->postBrowserNavigateHook($waitForContainer)
            ->keep('body'),
    );

    $results = helper_generatorToArray($crawler->run());

    expect($results[0]->get('body'))->toContain('<div id="delayed_container">hooray</div>');
});

test('BrowserAction::clickElementAndWaitForReload() works as expected', function () {
    $crawler = new HeadlessBrowserCrawler();

    $clickAndWait = BrowserAction::clickElementAndWaitForReload('#click');

    $crawler->input('http://localhost:8000/browser-actions/click-and-wait-for-reload');

    $crawler->addStep(
        Http::get()
            ->postBrowserNavigateHook($clickAndWait)
            ->keep('body'),
    );

    $results = helper_generatorToArray($crawler->run());

    // The kept body has to be the one of the reloaded page.
    expect($results[0]->get('body'))->toContain('<div id="reloaded">yes</div>');
});

test(
    'when on the click and wait for reload page, and the element is only clicked but we don\'t wait for reload, ' .
    'we don\'t get the reloaded page content',
    function () {
        $crawler = new HeadlessBrowserCrawler();

        // Only click, without any waiting for reload action afterwards.
        $clickOnly = BrowserAction::clickElement('#click');

        $crawler->input('http://localhost:8000/browser-actions/click-and-wait-for-reload');

        $crawler->addStep(
            Http::get()
                ->postBrowserNavigateHook($clickOnly)
                ->keep('body'),
        );

        $results = helper_generatorToArray($crawler->run());

        expect($results[0]->get('body'))->not()->toContain('<div id="reloaded">yes</div>');
    },
);

test(
    'when on the click and wait for reload page, and the element is clicked and we also wait for reload, we get the ' .
    'reloaded page content',
    function () {
        $crawler = new HeadlessBrowserCrawler();

        // Clicking and waiting for the reload are registered as two separate hooks here.
        $loadStep = Http::get()
            ->postBrowserNavigateHook(BrowserAction::clickElement('#click'))
            ->postBrowserNavigateHook(BrowserAction::waitForReload())
            ->keep('body');

        $crawler->input('http://localhost:8000/browser-actions/click-and-wait-for-reload');

        $crawler->addStep($loadStep);

        $results = helper_generatorToArray($crawler->run());

        expect($results[0]->get('body'))->toContain('<div id="reloaded">yes</div>');
    },
);

test('BrowserAction::evaluateAndWaitForReload() works as expected', function () {
    $crawler = new HeadlessBrowserCrawler();

    // Javascript that sends the browser to another page by assigning window.location.href.
    $redirectScript = 'window.location.href = \'http://localhost:8000/browser-actions/' .
        'evaluate-and-wait-for-reload-reloaded\'';

    $crawler->input('http://localhost:8000/browser-actions/evaluate-and-wait-for-reload');

    $crawler->addStep(
        Http::get()
            ->postBrowserNavigateHook(BrowserAction::evaluateAndWaitForReload($redirectScript))
            ->keep('body'),
    );

    $results = helper_generatorToArray($crawler->run());

    expect($results[0]->get('body'))->toContain('<div id="reloaded">yay</div>');
});

test('BrowserAction::wait() works as expected', function () {
    $crawler = new HeadlessBrowserCrawler();

    $crawler->input('http://localhost:8000/browser-actions/wait');

    $crawler->addStep(
        Http::get()
            ->postBrowserNavigateHook(BrowserAction::wait(0.3))
            ->keep('body'),
    );

    $results = helper_generatorToArray($crawler->run());

    // After waiting, the delayed container has to be part of the body.
    expect($results[0]->get('body'))->toContain('<div id="delayed_container">hooray</div>');
});

it('executes the javascript code provided via HeadlessBrowserLoaderHelper::setPageInitScript()', function () {
    $crawler = new HeadlessBrowserCrawler();

    $browserHelper = $crawler->getLoader()->browser();

    // The init script defines a value on window; the test expects to find it in #content.
    $browserHelper->setPageInitScript('window._secret_content = \'secret content\'');

    $crawler->input('http://localhost:8000/page-init-script');

    $crawler->addStep(Http::get());

    $crawler->addStep(Html::root()->extract(['content' => '#content']));

    $results = helper_generatorToArray($crawler->run());

    expect($results[0]->get('content'))->toBe('secret content');
});

it('gets the source of an XML response without being wrapped in an HTML document', function () {
    $crawler = new HeadlessBrowserCrawler();

    $crawler->input('http://localhost:8000/rss-feed');

    $crawler->addStep(Http::get()->keep(['body']));

    $results = helper_generatorToArray($crawler->run());

    // The body has to start with the XML declaration directly followed by the <rss element.
    $expectedStart = '<?xml version="1.0" encoding="utf-8"?>' . PHP_EOL . '<rss';

    expect($results[0]->get('body'))->toStartWith($expectedStart);
});

it(
    'gets the source of an XML response without being wrapped in an HTML document even when chrome does not ' .
    'identify the document as an XML document',
    function () {
        $crawler = new HeadlessBrowserCrawler();

        $crawler->input('http://localhost:8000/broken-mime-type-rss');

        $crawler->addStep(Http::get()->keep(['body']));

        $results = helper_generatorToArray($crawler->run());

        // The body has to start with the plain XML declaration.
        expect($results[0]->get('body'))->toStartWith('<?xml version="1.0" encoding="UTF-8"?>');
    },
);


================================================
FILE: tests/_Integration/Http/Html/PaginatedListingTest.php
================================================
<?php

namespace tests\_Integration\Http\Html;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

it('paginates through pagination', function () {
    $crawler = new class extends HttpCrawler {
        public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
        {
            return helper_getFastLoader($userAgent, $logger);
        }

        protected function userAgent(): UserAgentInterface
        {
            return new BotUserAgent('MyBot');
        }
    };

    $crawler->input('http://localhost:8000/paginated-listing');

    // Load the listing pages (paginating via #nextPage) and get the item links, kept as 'url'.
    $crawler->addStep(Http::get()->paginate('#nextPage'));

    $crawler->addStep(Html::getLinks('#listing .item a')->keepAs('url'));

    // Load each item page and extract the data from its article element.
    $crawler->addStep(Http::get());

    $crawler->addStep(
        Html::first('article')
            ->extract(['title' => 'h1', 'number' => '.someNumber'])
            ->keep(),
    );

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(10);

    $expectedFirst = [
        'url' => 'http://localhost:8000/paginated-listing/items/1',
        'title' => 'Some Item 1',
        'number' => '10',
    ];

    $expectedLast = [
        'url' => 'http://localhost:8000/paginated-listing/items/10',
        'title' => 'Some Item 10',
        'number' => '100',
    ];

    expect($results[0]->toArray())->toBe($expectedFirst)
        ->and($results[9]->toArray())->toBe($expectedLast);
});


================================================
FILE: tests/_Integration/Http/Html/SimpleListingTest.php
================================================
<?php

namespace tests\_Integration\Http\Html;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

it('gets all the links from a listing and gets data from the detail pages', function () {
    $crawler = new class extends HttpCrawler {
        public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
        {
            return helper_getFastLoader($userAgent, $logger);
        }

        protected function userAgent(): UserAgentInterface
        {
            return new BotUserAgent('MyBot');
        }
    };

    $crawler->input('http://localhost:8000/simple-listing');

    // Load the listing and get the links to the detail pages.
    $crawler->addStep(Http::get());

    $crawler->addStep(Html::getLinks('.listingItem a'));

    // Load each detail page and extract the article data.
    $crawler->addStep(Http::get());

    $crawler->addStep(
        Html::first('article')
            ->extract([
                'title' => 'h1',
                'date' => '.date',
                'author' => '.articleAuthor',
            ])
            ->keep(),
    );

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(3);

    $expectedArticles = [
        [
            'title' => 'Some Article 1',
            'date' => '2022-04-13',
            'author' => 'Christian Olear',
        ],
        [
            'title' => 'Some Article 2',
            'date' => '2022-04-14',
            'author' => 'Christian Olear',
        ],
        [
            'title' => 'Some Article 3',
            'date' => '2022-04-15',
            'author' => 'Christian Olear',
        ],
    ];

    expect($results[0]->toArray())->toBe($expectedArticles[0])
        ->and($results[1]->toArray())->toBe($expectedArticles[1])
        ->and($results[2]->toArray())->toBe($expectedArticles[2]);
});


================================================
FILE: tests/_Integration/Http/PaginationTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use PHPUnit\Framework\TestCase;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

/**
 * Crawler used by the pagination tests: identifies as "PaginationCrawler" and gets its
 * loader from the helper_getFastLoader() test helper.
 */
class PaginationCrawler extends HttpCrawler
{
    /**
     * Creates the loader via the helper_getFastLoader() test helper.
     */
    protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        $loader = helper_getFastLoader($userAgent, $logger);

        return $loader;
    }

    /**
     * The user agent the crawler identifies itself with.
     */
    protected function userAgent(): UserAgentInterface
    {
        $userAgent = new UserAgent('PaginationCrawler');

        return $userAgent;
    }
}

/** @var TestCase $this */

it('iterates through pagination with the simple website paginator', function () {
    $crawler = new PaginationCrawler();

    $crawler->input('http://localhost:8000/paginated-listing');

    // Paginate using the links found inside the #pagination element.
    $crawler->addStep(Http::get()->paginate('#pagination'));

    expect(helper_generatorToArray($crawler->run()))->toHaveCount(5);
});

it('only iterates pagination until max pages limit is reached', function () {
    $crawler = new PaginationCrawler();

    $crawler->input('http://localhost:8000/paginated-listing');

    // The second argument limits pagination to 2 pages.
    $crawler->addStep(Http::get()->paginate('#pagination', 2));

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(2);

    expect($this->getActualOutputForAssertion())->toContain('Max pages limit reached');
});

it('resets the finished paginating state after each processed (/paginated) input', function () {
    $crawler = new PaginationCrawler();

    $crawler->inputs([
        'http://localhost:8000/paginated-listing',
        'http://localhost:8000/paginated-listing?foo=bar',
    ]);

    $crawler->addStep(
        Http::get()
            ->paginate('#pagination', 2)
            ->outputKey('response'),
    );

    // Two inputs with a limit of 2 pages each have to yield 4 results in total.
    expect(helper_generatorToArray($crawler->run()))->toHaveCount(4);
});


================================================
FILE: tests/_Integration/Http/ProxyingTest.php
================================================
<?php

use Crwlr\Crawler\Result;
use Crwlr\Crawler\Steps\Loading\Http;
use Symfony\Component\Process\Process;

use function tests\helper_getFastCrawler;

/**
 * Static registry for the PHP built-in server processes that act as proxy servers in this test file.
 */
class ProxyServerProcesses
{
    /**
     * Ports the proxy server processes are started on.
     */
    public const PORTS = [8001, 8002, 8003];

    /**
     * Process per port; null as long as no process was started for that port (or after it was stopped).
     *
     * @var array<int, ?Process>
     */
    public static array $processes = [
        8001 => null,
        8002 => null,
        8003 => null,
    ];
}

/**
 * Lazily starts one PHP built-in web server per configured port, serving ProxyServer.php.
 *
 * Processes that were already started by an earlier test are reused. Only when at least
 * one new process was started, the hook waits a moment so the servers can come up.
 */
beforeEach(function () {
    $startedProcesses = false;

    foreach (ProxyServerProcesses::PORTS as $port) {
        if (ProxyServerProcesses::$processes[$port]) {
            continue; // Already started by a previous test.
        }

        // The script path goes through a shell, so it has to be escaped. Otherwise, a project
        // directory containing spaces or shell metacharacters breaks the command line.
        $process = Process::fromShellCommandline(
            'php -S localhost:' . $port . ' ' . escapeshellarg(__DIR__ . '/../ProxyServer.php'),
        );

        ProxyServerProcesses::$processes[$port] = $process;

        $process->start();

        $startedProcesses = true;
    }

    if ($startedProcesses) {
        // Give the freshly started servers 100 ms to start listening.
        usleep(100_000);
    }
});

/**
 * Stops all proxy server processes that were started and clears the registry.
 */
afterAll(function () {
    foreach (ProxyServerProcesses::PORTS as $port) {
        $process = ProxyServerProcesses::$processes[$port];

        if ($process !== null) {
            $process->stop(3, SIGINT);
        }

        ProxyServerProcesses::$processes[$port] = null;
    }
});

it('uses a proxy when the useProxy() method of the loader was called', function () {
    $crawler = helper_getFastCrawler();

    $crawler->getLoader()->useProxy('http://localhost:8001');

    $crawler
        ->input('http://www.crwlr.software/packages')
        ->addStep(Http::get()->keep(['body']));

    // Don't preserve the generator's keys: generator keys are not guaranteed to be unique
    // (e.g. when `yield from` is involved), and with preserved keys, results sharing a key
    // would silently overwrite each other.
    $results = iterator_to_array($crawler->run(), false);

    // The body is the output of ProxyServer.php, so the request went through the proxy.
    expect($results[0])
        ->toBeInstanceOf(Result::class)
        ->and($results[0]->get('body'))
        ->toContain('Proxy Server Response for http://www.crwlr.software/packages');
});

it('uses correct method, headers and HTTP version in the proxied request', function () {
    $crawler = helper_getFastCrawler();

    $crawler->getLoader()->useProxy('http://localhost:8001');

    $crawler
        ->input('http://www.crwlr.software/packages')
        ->addStep(
            Http::put(['Accept-Encoding' => 'gzip, deflate, br'], 'Hello World', '1.0')
                ->keep(['body']),
        );

    // Don't preserve the generator's keys: generator keys are not guaranteed to be unique
    // (e.g. when `yield from` is involved), and with preserved keys, results sharing a key
    // would silently overwrite each other.
    $results = iterator_to_array($crawler->run(), false);

    // ProxyServer.php echoes protocol version, method and body, and var_dump()s the request headers.
    expect($results[0])
        ->toBeInstanceOf(Result::class)
        ->and($results[0]->get('body'))
        ->toContain('Protocol Version: HTTP/1.0')
        ->toContain('Request Method: PUT')
        ->toContain('Request Body: Hello World')
        ->toContain('["Accept-Encoding"]=>' . PHP_EOL . '  string(17) "gzip, deflate, br"');
});

it('uses rotating proxies when the useRotatingProxies() method of the loader was called', function () {
    $crawler = helper_getFastCrawler();

    $crawler->getLoader()->useRotatingProxies([
        'http://localhost:8001',
        'http://localhost:8002',
        'http://localhost:8003',
    ]);

    $crawler
        ->input([
            'http://www.crwlr.software/packages/crawler/v1.1/getting-started',
            'http://www.crwlr.software/packages/url/v2.0/getting-started',
            'http://www.crwlr.software/packages/query-string/v1.0/getting-started',
            'http://www.crwlr.software/packages/robots-txt/v1.1/getting-started',
        ])
        ->addStep(Http::get()->keep(['body']));

    // The assertions below rely on the results being indexed 0..3 in the order they were yielded.
    // Generator keys are not guaranteed to be unique (e.g. when `yield from` is involved), so
    // don't preserve them: with preserved keys, results sharing a key would overwrite each other.
    $results = iterator_to_array($crawler->run(), false);

    expect($results)->toHaveCount(4)
        ->and($results[0])
        ->toBeInstanceOf(Result::class)
        ->and($results[0]->get('body'))
        ->toContain('Port: 8001')           // First request with first proxy
        ->and($results[1])
        ->toBeInstanceOf(Result::class)
        ->and($results[1]->get('body'))
        ->toContain('Port: 8002')           // Second request with second proxy
        ->and($results[2])
        ->toBeInstanceOf(Result::class)
        ->and($results[2]->get('body'))
        ->toContain('Port: 8003')           // Third request with third proxy
        ->and($results[3])
        ->toBeInstanceOf(Result::class)
        ->and($results[3]->get('body'))
        ->toContain('Port: 8001');          // And finally the fourth request with the first proxy again.
});

it('can also use a proxy when using the headless browser', function () {
    $crawler = helper_getFastCrawler();

    $crawler
        ->getLoader()
        ->useHeadlessBrowser()
        ->useProxy('http://localhost:8001');

    $crawler
        ->input('http://www.crwlr.software/blog')
        ->addStep(
            Http::get(['Accept-Language' => 'de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7'])
                ->keep(['body']),
        );

    // Don't preserve the generator's keys: generator keys are not guaranteed to be unique
    // (e.g. when `yield from` is involved), and with preserved keys, results sharing a key
    // would silently overwrite each other.
    $results = iterator_to_array($crawler->run(), false);

    // Unlike in the non-browser test above, the `>` of the var_dump() output is expected
    // as the HTML entity `&gt;` in the body here.
    expect($results[0])
        ->toBeInstanceOf(Result::class)
        ->and($results[0]->get('body'))
        ->toContain('["Accept-Language"]=&gt;' . PHP_EOL . '  string(35) "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7"');
});

it('can also use rotating proxies when using the headless browser', function () {
    $crawler = helper_getFastCrawler();

    $crawler
        ->getLoader()
        ->useHeadlessBrowser()
        ->useRotatingProxies([
            'http://localhost:8001',
            'http://localhost:8002',
        ]);

    $crawler
        ->input([
            'http://www.crwlr.software/packages/crawler/v1.1',
            'http://www.crwlr.software/packages/url/v2.0',
            'http://www.crwlr.software/packages/query-string/v1.0',
        ])
        ->addStep(Http::get()->keep(['body']));

    // The assertions below rely on the results being indexed 0..2 in the order they were yielded.
    // Generator keys are not guaranteed to be unique (e.g. when `yield from` is involved), so
    // don't preserve them: with preserved keys, results sharing a key would overwrite each other.
    $results = iterator_to_array($crawler->run(), false);

    expect($results)->toHaveCount(3)
        ->and($results[0])
        ->toBeInstanceOf(Result::class)
        ->and($results[0]->get('body'))
        ->toContain('Port: 8001')           // First request with first proxy
        ->and($results[1])
        ->toBeInstanceOf(Result::class)
        ->and($results[1]->get('body'))
        ->toContain('Port: 8002')           // Second request with second proxy
        ->and($results[2])
        ->toBeInstanceOf(Result::class)
        ->and($results[2]->get('body'))
        ->toContain('Port: 8001');          // And finally the third request with the first proxy again.
});


================================================
FILE: tests/_Integration/Http/PublisherExampleTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Dom;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

/**
 * Crawler used by the publisher example integration tests in this file.
 *
 * It identifies itself with the user agent "SomeUserAgent" and gets its loader
 * from the helper_getFastLoader() test helper.
 */
class PublisherExampleCrawler extends HttpCrawler
{
    /**
     * The user agent this crawler sends with its requests.
     */
    protected function userAgent(): UserAgentInterface
    {
        return new UserAgent('SomeUserAgent');
    }

    /**
     * Builds the loader via the shared test helper, passing on user agent and logger.
     */
    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        return helper_getFastLoader($userAgent, $logger);
    }
}

test('Http steps can also deal with multiple URLs as one array input', function () {
    $crawler = new PublisherExampleCrawler();

    // Extracts the author name and the absolute URLs of all the author's books from an author page.
    // Only the author name is kept for the final result; the book URLs feed the next Http step.
    $extractAuthor = Html::root()
        ->extract([
            'author' => 'h1',
            'bookUrls' => Dom::cssSelector('#author-data .books a.book')->attribute('href')->toAbsoluteUrl(),
        ])
        ->keep(['author']);

    $extractBook = Html::root()
        ->extract(['book' => 'h1'])
        ->keep();

    $crawler
        ->input('http://localhost:8000/publisher/authors')
        ->addStep(Http::get())
        ->addStep(Html::getLinks('#authors a'))
        ->addStep(Http::get())
        ->addStep($extractAuthor)
        ->addStep(Http::get()->useInputKey('bookUrls'))
        ->addStep($extractBook);

    $results = helper_generatorToArray($crawler->run());

    // One result per book, in this order.
    $expectedRows = [
        ['author' => 'John Example', 'book' => 'Some novel'],
        ['author' => 'John Example', 'book' => 'Another novel'],
        ['author' => 'Susan Example', 'book' => 'Poems #1'],
        ['author' => 'Susan Example', 'book' => 'Poems #2'],
        ['author' => 'Susan Example', 'book' => 'Poems #3'],
    ];

    expect($results)->toHaveCount(5);

    foreach ($expectedRows as $index => $expectedRow) {
        expect($results[$index]->toArray())->toBe($expectedRow);
    }
});

it('turns an array of URLs to nested extracted data from those child pages using sub crawlers', function () {
    // Assembles the main crawler (authors) with nested sub crawlers for books and their editions.
    $builder = new class {
        public function build(): \Crwlr\Crawler\Crawler
        {
            return (new PublisherExampleCrawler())
                ->input('http://localhost:8000/publisher/authors')
                ->addStep(Http::get())
                ->addStep(Html::getLinks('#authors a'))
                ->addStep(Http::get())
                ->addStep($this->extractAuthorData());
        }

        /**
         * Author page: scalar author data plus book links, which are resolved by a sub crawler.
         */
        private function extractAuthorData(): Html
        {
            $fields = [
                'name' => 'h1',
                'age' => '#author-data .age',
                'bornIn' => '#author-data .born-in',
                'books' => Dom::cssSelector('#author-data .books a.book')->link(),
            ];

            return Html::root()
                ->extract($fields)
                ->subCrawlerFor(
                    'books',
                    fn (\Crwlr\Crawler\Crawler $crawler) => $crawler
                        ->addStep(Http::get())
                        ->addStep($this->extractBookData()),
                );
        }

        /**
         * Book page: title plus edition links, which are resolved by another sub crawler.
         */
        private function extractBookData(): Html
        {
            $fields = ['title' => 'h1', 'editions' => Dom::cssSelector('#editions a')->link()];

            return Html::root()
                ->extract($fields)
                ->subCrawlerFor(
                    'editions',
                    fn (\Crwlr\Crawler\Crawler $crawler) => $crawler
                        ->addStep(Http::get())
                        ->addStep($this->extractEditionData()),
                );
        }

        /**
         * Edition page: year and publishing company.
         */
        private function extractEditionData(): Html
        {
            return Html::root()->extract(['year' => '.year', 'publisher' => '.publishingCompany']);
        }
    };

    $results = helper_generatorToArray($builder->build()->run());

    $expectedJohn = [
        'name' => 'John Example',
        'age' => '51',
        'bornIn' => 'Lisbon',
        'books' => [
            [
                'title' => 'Some novel',
                'editions' => [
                    ['year' => '1996', 'publisher' => 'Foo'],
                    ['year' => '2005', 'publisher' => 'Foo'],
                ],
            ],
            [
                'title' => 'Another novel',
                'editions' => [
                    ['year' => '2001', 'publisher' => 'Foo'],
                    ['year' => '2009', 'publisher' => 'Bar'],
                    ['year' => '2017', 'publisher' => 'Bar'],
                ],
            ],
        ],
    ];

    $expectedSusan = [
        'name' => 'Susan Example',
        'age' => '49',
        'bornIn' => 'Athens',
        'books' => [
            [
                'title' => 'Poems #1',
                'editions' => [
                    ['year' => '2008', 'publisher' => 'Poems'],
                    ['year' => '2009', 'publisher' => 'Poems'],
                ],
            ],
            [
                'title' => 'Poems #2',
                'editions' => [
                    ['year' => '2011', 'publisher' => 'Poems'],
                    ['year' => '2014', 'publisher' => 'New Poems'],
                ],
            ],
            [
                'title' => 'Poems #3',
                'editions' => [
                    ['year' => '2013', 'publisher' => 'Poems'],
                    ['year' => '2017', 'publisher' => 'New Poems'],
                ],
            ],
        ],
    ];

    expect($results)->toHaveCount(2);

    expect($results[0]->toArray())->toBe($expectedJohn);

    expect($results[1]->toArray())->toBe($expectedSusan);
});

test('it can also keep the URLs, provided to the sub crawler', function () {
    // Same crawler structure as in the test above, but the Http steps in the sub crawlers
    // additionally keep their input (the URL) under the key 'url'.
    $builder = new class {
        public function build(): \Crwlr\Crawler\Crawler
        {
            return (new PublisherExampleCrawler())
                ->input('http://localhost:8000/publisher/authors')
                ->addStep(Http::get())
                ->addStep(Html::getLinks('#authors a'))
                ->addStep(Http::get())
                ->addStep($this->extractAuthorData());
        }

        /**
         * Author page: scalar author data plus book links, which are resolved by a sub crawler.
         */
        private function extractAuthorData(): Html
        {
            $fields = [
                'name' => 'h1',
                'age' => '#author-data .age',
                'bornIn' => '#author-data .born-in',
                'books' => Dom::cssSelector('#author-data .books a.book')->link(),
            ];

            return Html::root()
                ->extract($fields)
                ->subCrawlerFor(
                    'books',
                    fn (\Crwlr\Crawler\Crawler $crawler) => $crawler
                        ->addStep(Http::get()->keepInputAs('url'))
                        ->addStep($this->extractBookData()),
                );
        }

        /**
         * Book page: title plus edition links, which are resolved by another sub crawler.
         */
        private function extractBookData(): Html
        {
            $fields = ['title' => 'h1', 'editions' => Dom::cssSelector('#editions a')->link()];

            return Html::root()
                ->extract($fields)
                ->subCrawlerFor(
                    'editions',
                    fn (\Crwlr\Crawler\Crawler $crawler) => $crawler
                        ->addStep(Http::get()->keepInputAs('url'))
                        ->addStep($this->extractEditionData()),
                );
        }

        /**
         * Edition page: year and publishing company.
         */
        private function extractEditionData(): Html
        {
            return Html::root()->extract(['year' => '.year', 'publisher' => '.publishingCompany']);
        }
    };

    $results = helper_generatorToArray($builder->build()->run());

    // Builds one expected edition entry; key order matters because toBe() compares with ===.
    $edition = static fn (string $url, string $year, string $publisher): array => [
        'url' => $url,
        'year' => $year,
        'publisher' => $publisher,
    ];

    $expectedJohn = [
        'name' => 'John Example',
        'age' => '51',
        'bornIn' => 'Lisbon',
        'books' => [
            [
                'url' => 'http://localhost:8000/publisher/books/1',
                'title' => 'Some novel',
                'editions' => [
                    $edition('http://localhost:8000/publisher/books/1/edition/1', '1996', 'Foo'),
                    $edition('http://localhost:8000/publisher/books/1/edition/2', '2005', 'Foo'),
                ],
            ],
            [
                'url' => 'http://localhost:8000/publisher/books/2',
                'title' => 'Another novel',
                'editions' => [
                    $edition('http://localhost:8000/publisher/books/2/edition/1', '2001', 'Foo'),
                    $edition('http://localhost:8000/publisher/books/2/edition/2', '2009', 'Bar'),
                    $edition('http://localhost:8000/publisher/books/2/edition/3', '2017', 'Bar'),
                ],
            ],
        ],
    ];

    $expectedSusan = [
        'name' => 'Susan Example',
        'age' => '49',
        'bornIn' => 'Athens',
        'books' => [
            [
                'url' => 'http://localhost:8000/publisher/books/3',
                'title' => 'Poems #1',
                'editions' => [
                    $edition('http://localhost:8000/publisher/books/3/edition/1', '2008', 'Poems'),
                    $edition('http://localhost:8000/publisher/books/3/edition/2', '2009', 'Poems'),
                ],
            ],
            [
                'url' => 'http://localhost:8000/publisher/books/4',
                'title' => 'Poems #2',
                'editions' => [
                    $edition('http://localhost:8000/publisher/books/4/edition/1', '2011', 'Poems'),
                    $edition('http://localhost:8000/publisher/books/4/edition/2', '2014', 'New Poems'),
                ],
            ],
            [
                'url' => 'http://localhost:8000/publisher/books/5',
                'title' => 'Poems #3',
                'editions' => [
                    $edition('http://localhost:8000/publisher/books/5/edition/1', '2013', 'Poems'),
                    $edition('http://localhost:8000/publisher/books/5/edition/2', '2017', 'New Poems'),
                ],
            ],
        ],
    ];

    expect($results)->toHaveCount(2);

    expect($results[0]->toArray())->toBe($expectedJohn);

    expect($results[1]->toArray())->toBe($expectedSusan);
});


================================================
FILE: tests/_Integration/Http/QueryParamPaginationTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Loading\Http\Paginator;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\QueryParamsPaginator;
use Crwlr\Crawler\Steps\Loading\Http\Paginators\StopRules\PaginatorStopRules;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use PHPUnit\Framework\TestCase;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

/**
 * Crawler used by the query param pagination integration tests in this file.
 *
 * It identifies itself with the user agent "QueryParamPaginationCrawler" and gets its loader
 * from the helper_getFastLoader() test helper.
 */
class QueryParamPaginationCrawler extends HttpCrawler
{
    /**
     * Builds the loader via the shared test helper, passing on user agent and logger.
     */
    protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        return helper_getFastLoader($userAgent, $logger);
    }

    /**
     * The user agent this crawler sends with its requests.
     */
    protected function userAgent(): UserAgentInterface
    {
        return new UserAgent('QueryParamPaginationCrawler');
    }
}

/** @var TestCase $this */

it('paginates using query params sent in the request body', function () {
    $crawler = new QueryParamPaginationCrawler();

    // Increase the 'page' param in the request body; stop when 'data.items' in the JSON response is empty.
    $paginator = Paginator::queryParams(5)
        ->inBody()
        ->increase('page')
        ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items'));

    $paginatedRequest = Http::post(body: 'page=1')
        ->paginate($paginator)
        ->keep(['body']);

    $crawler
        ->input('http://localhost:8000/query-param-pagination')
        ->addStep($paginatedRequest);

    $pages = helper_generatorToArray($crawler->run());

    expect($pages)->toHaveCount(4);
});

it('also paginates using query params sent in the request body, when used in combination with static URL', function () {
    $crawler = new QueryParamPaginationCrawler();

    // No stop rule here, so only the max pages argument (3) limits the pagination.
    $paginator = Paginator::queryParams(3)
        ->inBody()
        ->increase('page');

    // The URL comes from staticUrl(), not from the crawler's input ('foo').
    $paginatedRequest = Http::post(body: 'page=1')
        ->staticUrl('http://localhost:8000/query-param-pagination')
        ->paginate($paginator)
        ->keep(['body']);

    $crawler
        ->input('foo')
        ->addStep($paginatedRequest);

    $pages = helper_generatorToArray($crawler->run());

    expect($pages)->toHaveCount(3);
});

it('paginates using URL query params', function () {
    $crawler = new QueryParamPaginationCrawler();

    // Increase the 'page' param in the URL; stop when 'data.items' in the JSON response is empty.
    $paginator = Paginator::queryParams(5)
        ->inUrl()
        ->increase('page')
        ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items'));

    $paginatedRequest = Http::get()
        ->paginate($paginator)
        ->keep(['body']);

    $crawler
        ->input('http://localhost:8000/query-param-pagination?page=1')
        ->addStep($paginatedRequest);

    $pages = helper_generatorToArray($crawler->run());

    expect($pages)->toHaveCount(4);
});

it('paginates only until the max pages limit', function () {
    $crawler = new QueryParamPaginationCrawler();

    // Max pages argument of 2: pagination has to stop after two pages.
    $paginator = QueryParamsPaginator::paramsInUrl(2)
        ->increase('page')
        ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items'));

    $paginatedRequest = Http::get()
        ->paginate($paginator)
        ->keep(['body']);

    $crawler
        ->input('http://localhost:8000/query-param-pagination?page=1')
        ->addStep($paginatedRequest);

    $pages = helper_generatorToArray($crawler->run());

    expect($pages)->toHaveCount(2);
});

it('resets the finished paginating state after each processed (/paginated) input', function () {
    $crawler = new QueryParamPaginationCrawler();

    $startUrls = [
        'http://localhost:8000/query-param-pagination?page=1',
        'http://localhost:8000/query-param-pagination?page=1&foo=bar',
    ];

    $paginator = QueryParamsPaginator::paramsInUrl(2)
        ->increase('page')
        ->stopWhen(PaginatorStopRules::isEmptyInJson('data.items'));

    $paginatedRequest = Http::get()
        ->paginate($paginator)
        ->keep(['body']);

    $crawler
        ->inputs($startUrls)
        ->addStep($paginatedRequest);

    $pages = helper_generatorToArray($crawler->run());

    // 2 inputs x 2 pages: pagination must start over for the second input.
    expect($pages)->toHaveCount(4);
});


================================================
FILE: tests/_Integration/Http/RedirectTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\Cache\Exceptions\MissingZlibExtensionException;
use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Step;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Generator;
use PHPUnit\Framework\TestCase;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;

/**
 * Crawler used by the redirect integration tests in this file.
 *
 * It only defines the user agent ("RedirectBot"); the loader is the one provided by HttpCrawler.
 */
class RedirectTestCrawler extends HttpCrawler
{
    /**
     * The user agent this crawler sends with its requests.
     */
    protected function userAgent(): UserAgentInterface
    {
        return new UserAgent('RedirectBot');
    }
}

/**
 * Test step that outputs the response body of its input as a string.
 */
class GetResponseBodyAsString extends Step
{
    /**
     * Yields exactly one output: the result of Http::getBodyString() for the given input.
     *
     * @param RespondedRequest $input
     * @throws MissingZlibExtensionException
     */
    protected function invoke(mixed $input): Generator
    {
        $bodyString = Http::getBodyString($input);

        yield $bodyString;
    }
}

/** @var TestCase $this */

it('follows redirects', function () {
    $crawler = new RedirectTestCrawler();

    $bodyStep = (new GetResponseBodyAsString())->keepAs('body');

    $crawler
        ->input('http://localhost:8000/redirect?stopAt=5')
        ->addStep(Http::get())
        ->addStep($bodyStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    expect($results[0]->get('body'))->toBe('success after 5 redirects');
});

it('stops at 10 redirects by default', function () {
    $crawler = new RedirectTestCrawler();

    $bodyStep = (new GetResponseBodyAsString())->keepAs('body');

    // 11 redirects are more than the default limit, so loading is expected to fail.
    $crawler
        ->input('http://localhost:8000/redirect?stopAt=11')
        ->addStep(Http::get())
        ->addStep($bodyStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(0);

    // The failure only shows up in the log output, not as a result.
    expect($this->getActualOutputForAssertion())
        ->toContain('Failed to load http://localhost:8000/redirect?stopAt=11: Too many redirects.');
});

test('you can set your own max redirects limit', function () {
    // Crawler raising the loader's max redirects limit to 15.
    $crawler = new class extends HttpCrawler {
        protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
        {
            $defaultLoader = parent::loader($userAgent, $logger);

            // setMaxRedirects() is only called when the parent's loader is an HttpLoader.
            if ($defaultLoader instanceof HttpLoader) {
                $defaultLoader->setMaxRedirects(15);
            }

            return $defaultLoader;
        }

        protected function userAgent(): UserAgentInterface
        {
            return new UserAgent('RedirectBot');
        }
    };

    $bodyStep = (new GetResponseBodyAsString())->keepAs('body');

    $crawler
        ->input('http://localhost:8000/redirect?stopAt=11')
        ->addStep(Http::get())
        ->addStep($bodyStep);

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    expect($results[0]->get('body'))->toBe('success after 11 redirects');
});


================================================
FILE: tests/_Integration/Http/RequestParamsFromInputTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\Steps\Json;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\Steps\Step;
use Generator;

use function tests\helper_generatorToArray;
use function tests\helper_getFastCrawler;

test('Http steps can receive url, body and headers from an input array', function () {
    // Step that ignores its input and yields one array containing all the request params.
    $requestParamsStep = new class extends Step {
        protected function invoke(mixed $input): Generator
        {
            $headers = [
                'header-x' => 'foo',
                'header-y' => ['bar'],
            ];

            yield [
                'url' => 'http://localhost:8000/print-headers',
                'body' => 'test',
                'headers' => $headers,
                'header-y' => 'baz',
                'header-z' => ['quz'],
            ];
        }
    };

    // Body, the headers array, and two single headers are all taken from keys of the input array.
    $httpStep = Http::get()
        ->useInputKeyAsBody('body')
        ->useInputKeyAsHeaders('headers')
        ->useInputKeyAsHeader('header-y', 'header-y')
        ->useInputKeyAsHeader('header-z', 'header-z');

    $crawler = helper_getFastCrawler();

    $crawler
        ->input('anything')
        ->addStep($requestParamsStep)
        ->addStep($httpStep)
        ->addStep(Json::all());

    $results = helper_generatorToArray($crawler->run());

    expect($results)->toHaveCount(1);

    $receivedHeaders = $results[0]->toArray();

    expect($receivedHeaders['Content-Length'])->toBe('4')
        ->and($receivedHeaders['header-x'])->toBe('foo')
        ->and($receivedHeaders['header-y'])->toBe('bar, baz')
        ->and($receivedHeaders['header-z'])->toBe('quz');
});


================================================
FILE: tests/_Integration/Http/RetryErrorResponsesTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\Http\Politeness\RetryErrorResponseHandler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;

use function tests\helper_generatorToArray;

/**
 * Crawler used by the retry integration tests in this file.
 *
 * Its HttpLoader gets a custom RetryErrorResponseHandler.
 */
class RetryErrorResponsesCrawler extends HttpCrawler
{
    /**
     * Creates an HttpLoader with a RetryErrorResponseHandler configured for these tests.
     */
    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        // NOTE(review): presumably 2 retries, waiting 1 and then 2 seconds, with a max wait of 3 seconds.
        // Confirm the argument meanings against RetryErrorResponseHandler's constructor.
        $retryHandler = new RetryErrorResponseHandler(2, [1, 2], 3);

        return new HttpLoader($userAgent, logger: $logger, retryErrorResponseHandler: $retryHandler);
    }

    /**
     * The user agent this crawler sends with its requests.
     */
    protected function userAgent(): UserAgentInterface
    {
        return new UserAgent('SomeBot');
    }
}

// The parameter is typed as string, like in the other dataset tests in this file.
it('retries after defined number of seconds', function (string $path) {
    $crawler = new RetryErrorResponsesCrawler();

    $crawler->input('http://localhost:8000' . $path)
        ->addStep(Http::get());

    $start = microtime(true);

    helper_generatorToArray($crawler->run());

    $end = microtime(true);

    $diff = $end - $start;

    // The whole run, including all waiting between retries, has to take between 3.0 and 3.5 seconds.
    expect($diff)->toBeGreaterThan(3.0);

    expect($diff)->toBeLessThan(3.5);
})->with(['/too-many-requests', '/service-unavailable']);

it(
    'starts the first retry after the number of seconds returned in the Retry-After HTTP header',
    function (string $path) {
        $crawler = new RetryErrorResponsesCrawler();

        $url = 'http://localhost:8000' . $path . '/retry-after';

        $crawler
            ->input($url)
            ->addStep(Http::get());

        $startedAt = microtime(true);

        helper_generatorToArray($crawler->run());

        $finishedAt = microtime(true);

        $elapsedSeconds = $finishedAt - $startedAt;

        // The whole run has to take between 4.0 and 4.5 seconds.
        expect($elapsedSeconds)->toBeGreaterThan(4.0);

        expect($elapsedSeconds)->toBeLessThan(4.5);
    },
)->with(['/too-many-requests', '/service-unavailable']);

it('goes on crawling when a retry receives a successful response', function (string $path) {
    $crawler = new RetryErrorResponsesCrawler();

    $url = 'http://localhost:8000' . $path . '/succeed-on-second-attempt';

    $crawler
        ->input($url)
        ->addStep(Http::get());

    $startedAt = microtime(true);

    $results = helper_generatorToArray($crawler->run());

    $finishedAt = microtime(true);

    $elapsedSeconds = $finishedAt - $startedAt;

    // The successful second attempt has to produce a result...
    expect($results)->toHaveCount(1);

    // ...and the whole run has to take between 1.0 and 1.5 seconds.
    expect($elapsedSeconds)->toBeGreaterThan(1.0);

    expect($elapsedSeconds)->toBeLessThan(1.5);
})->with(['/too-many-requests', '/service-unavailable']);


================================================
FILE: tests/_Integration/Http/RobotsTxtTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Html;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;
use tests\_Stubs\DummyLogger;

use function tests\helper_generatorToArray;
use function tests\helper_getFastLoader;

/**
 * Crawler used by the robots.txt integration tests in this file.
 *
 * It uses a BotUserAgent ("MyBot") and logs to a DummyLogger, so tests can inspect
 * the logged messages afterwards.
 *
 * @method DummyLogger getLogger()
 */
class RobotsTxtCrawler extends HttpCrawler
{
    /**
     * Builds the loader via the shared test helper, passing on user agent and logger.
     */
    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        return helper_getFastLoader($userAgent, $logger);
    }

    /**
     * The bot user agent this crawler sends with its requests.
     */
    protected function userAgent(): UserAgentInterface
    {
        return new BotUserAgent('MyBot');
    }

    /**
     * The logger collecting the messages that the tests assert on.
     */
    protected function logger(): LoggerInterface
    {
        return new DummyLogger();
    }
}

it('does not warn about loader hooks being called multiple times', function () {
    // This occurred because the RobotsTxtHandler, used by the HttpLoader, loads the robots.txt via HttpLoader::load().
    // The call to the RobotsTxtHandler is triggered from within HttpLoader::load(), after the loader hooks
    // had already been reset at the start of the load() method. Resetting the loader hooks not only at the beginning
    // but also at the end of HttpLoader::load() resolves the issue.
    $crawler = new RobotsTxtCrawler();

    $extractBody = Html::root()->extract('body')->keepAs('body');

    $crawler
        ->input('http://localhost:8000/hello-world')
        ->addStep(Http::get())
        ->addStep($extractBody);

    $results = helper_generatorToArray($crawler->run());

    expect($results[0]->get('body'))->toBe('Hello World!');

    // None of the logged messages may contain the "already called" warning.
    $loggedMessages = $crawler->getLogger()->messages;

    foreach ($loggedMessages as $logEntry) {
        expect($logEntry['message'])->not->toContain(' was already called in this load call.');
    }
});

it('also does not warn about loader hooks being called multiple times when loadOrFail() is used', function () {
    // Same scenario as in the previous test of this file; the only difference is
    // that the Http step is configured with stopOnErrorResponse().
    $crawler = new RobotsTxtCrawler();

    $extractBody = Html::root()->extract('body')->keepAs('body');

    $crawler
        ->input('http://localhost:8000/hello-world')
        ->addStep(Http::get()->stopOnErrorResponse())
        ->addStep($extractBody);

    $results = helper_generatorToArray($crawler->run());

    expect($results[0]->get('body'))->toBe('Hello World!');

    // None of the logged messages may contain the "already called" warning.
    $loggedMessages = $crawler->getLogger()->messages;

    foreach ($loggedMessages as $logEntry) {
        expect($logEntry['message'])->not->toContain(' was already called in this load call.');
    }
});


================================================
FILE: tests/_Integration/Http/TimeoutTest.php
================================================
<?php

namespace tests\_Integration\Http;

use Crwlr\Crawler\HttpCrawler;
use Crwlr\Crawler\Loader\Http\HttpLoader;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\Steps\Loading\Http;
use Crwlr\Crawler\UserAgents\UserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use PHPUnit\Framework\TestCase;
use Psr\Log\LoggerInterface;

/** @var TestCase $this */

it('Fails when timeout is exceeded', function () {
    // Crawler with a loader whose HTTP client is configured with a connect timeout and a timeout of 1.
    $timeoutCrawler = new class extends HttpCrawler {
        protected function userAgent(): UserAgentInterface
        {
            return new UserAgent('SomeUserAgent');
        }

        public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
        {
            $clientConfig = ['connect_timeout' => 1, 'timeout' => 1];

            return new HttpLoader($userAgent, logger: $logger, defaultGuzzleClientConfig: $clientConfig);
        }
    };

    // The /sleep route of the test server delays its response for longer than that.
    $timeoutCrawler
        ->input('http://localhost:8000/sleep')
        ->addStep(Http::get());

    $timeoutCrawler->runAndTraverse();

    $loggedOutput = $this->getActualOutputForAssertion();

    expect($loggedOutput)->toContain('Operation timed out');
});


================================================
FILE: tests/_Integration/ProxyServer.php
================================================
<?php

// Reports back what this server received: requested URI, port, protocol version, request method, request body and
// finally a dump of all request headers.
$requestUri = $_SERVER['REQUEST_URI'] ?? '?';

echo "Proxy Server Response for {$requestUri}" . PHP_EOL . PHP_EOL;

echo "Port: {$_SERVER['SERVER_PORT']}" . PHP_EOL;

echo "Protocol Version: {$_SERVER['SERVER_PROTOCOL']}" . PHP_EOL;

echo "Request Method: {$_SERVER['REQUEST_METHOD']}" . PHP_EOL;

$requestBody = file_get_contents('php://input');

echo "Request Body: {$requestBody}" . PHP_EOL;

var_dump(getallheaders());


================================================
FILE: tests/_Integration/Server.php
================================================
<?php

// The request URI all routing decisions below are based on. It still contains the query string (there are
// comparisons against values like '/crawling/main?redirect=1'), so routes reading query parameters are matched
// by prefix instead of by equality.
$route = $_SERVER['REQUEST_URI'];

/**
 * Extracts the path segment directly following $after in $route.
 *
 * Example: getParamAfter('/publisher/books/2/edition/1', '/publisher/books/') returns '2'.
 *
 * @param string $route the request URI to read the parameter from
 * @param string $after the part of the route directly preceding the wanted segment; when it is an empty string,
 *                      the whole $route is returned
 * @return string the text between the first occurrence of $after and the next '/' (or the next occurrence of
 *                $after, or the end of the route); an empty string when $after does not occur in $route
 */
function getParamAfter(string $route, string $after): string
{
    if ($after === '') {
        return $route;
    }

    $parts = explode($after, $route);

    // When $after is not part of the route, there is no second element. Without this guard, reading it raises an
    // "undefined array key" warning and passes null to explode().
    if (!isset($parts[1])) {
        return '';
    }

    return explode('/', $parts[1])[0];
}

// Simple listing: the overview page and the article detail pages below it.
if ($route === '/simple-listing') {
    return include(__DIR__ . '/_Server/SimpleListing.php');
}

if (str_starts_with($route, '/simple-listing/article/')) {
    // NOTE(review): $articleId is presumably read by the included detail template – confirm there.
    $articleId = getParamAfter($route, '/simple-listing/article/');

    return include(__DIR__ . '/_Server/SimpleListing/Detail.php');
}

// Paginated listing: the more specific item detail route is checked first, because it also starts with
// '/paginated-listing'.
if (str_starts_with($route, '/paginated-listing')) {
    if (str_starts_with($route, '/paginated-listing/items/')) {
        // The included detail template prints $itemId.
        $itemId = getParamAfter($route, '/paginated-listing/items/');

        return include(__DIR__ . '/_Server/PaginatedListing/Detail.php');
    }

    return include(__DIR__ . '/_Server/PaginatedListing.php');
}

if (str_starts_with($route, '/query-param-pagination')) {
    return include(__DIR__ . '/_Server/QueryParamPagination.php');
}

if ($route === '/blog-post-with-json-ld') {
    return include(__DIR__ . '/_Server/BlogPostWithJsonLd.php');
}

if ($route === '/js-rendering') {
    return include(__DIR__ . '/_Server/JsGeneratedContent.php');
}

if ($route === '/print-headers') {
    return include(__DIR__ . '/_Server/PrintHeaders.php');
}

// Routes for setting cookies, either via response or via javascript.
if ($route === '/set-cookie') {
    return include(__DIR__ . '/_Server/SetCookie.php');
}

if ($route === '/set-js-cookie') {
    return include(__DIR__ . '/_Server/SetCookieJs.php');
}

// Script that sets the cookie "testcookie=javascriptcookie" when the element with the ID consent_btn is clicked.
if ($route === '/scripts/set-cookie.js') {
    echo <<<JS
        document.addEventListener("DOMContentLoaded", function () {
            document.getElementById('consent_btn').addEventListener('click', function (ev) {
                ev.preventDefault();
                document.cookie = "testcookie=javascriptcookie";
            }, false);
        }, false);
        JS;
    return;
}

if ($route === '/set-delayed-js-cookie') {
    return include(__DIR__ . '/_Server/SetDelayedCookieJs.php');
}

if ($route === '/set-multiple-js-cookies') {
    return include(__DIR__ . '/_Server/SetMultipleCookiesJs.php');
}

// Pages for the browser action tests. A route starting with '/browser-actions' that matches none of the
// conditions inside, falls through to the routes further below.
if (str_starts_with($route, '/browser-actions')) {
    if ($route === '/browser-actions') {
        return include(__DIR__ . '/_Server/BrowserActions/Main.php');
    }

    // Matched by prefix, because the page reloads itself with the query string ?reloaded=1.
    if (str_starts_with($route, '/browser-actions/click-and-wait-for-reload')) {
        return include(__DIR__ . '/_Server/BrowserActions/ClickAndWaitForReload.php');
    }

    if ($route === '/browser-actions/evaluate-and-wait-for-reload') {
        return include(__DIR__ . '/_Server/BrowserActions/EvaluateAndWaitForReload.php');
    }

    if ($route === '/browser-actions/evaluate-and-wait-for-reload-reloaded') {
        return include(__DIR__ . '/_Server/BrowserActions/EvaluateAndWaitForReloadReloaded.php');
    }

    if ($route === '/browser-actions/wait') {
        return include(__DIR__ . '/_Server/BrowserActions/Wait.php');
    }
}

// Routes printing the cookie(s) received with the request.
if ($route === '/print-cookie') {
    return include(__DIR__ . '/_Server/PrintCookie.php');
}

if ($route === '/print-cookies') {
    return include(__DIR__ . '/_Server/PrintCookies.php');
}

// All routes starting with '/crawling' are resolved inside the included file, based on $route.
if (str_starts_with($route, '/crawling')) {
    return include(__DIR__ . '/_Server/Crawling.php');
}

// Routes for testing the handling of error responses that should be retried.
if (str_starts_with($route, '/too-many-requests')) {
    if (str_ends_with($route, '/succeed-on-second-attempt')) {
        // The session remembers whether this route has already been requested before: $isSecondRequest is only
        // true when an earlier request of the same session has set the flag.
        session_start();

        $isSecondRequest = isset($_SESSION["isSecondRequest"]) && $_SESSION["isSecondRequest"] === true;

        if (!$isSecondRequest) {
            $_SESSION["isSecondRequest"] = true;
        }
    }

    // NOTE(review): $retryAfter (and $isSecondRequest) are presumably evaluated by the included template, e.g. to
    // send a Retry-After header – confirm there.
    $retryAfter = str_ends_with($route, '/retry-after') ? 2 : null;

    return include(__DIR__ . '/_Server/TooManyRequests.php');
}

// Same flow as for '/too-many-requests' above.
if (str_starts_with($route, '/service-unavailable')) {
    if (str_ends_with($route, '/succeed-on-second-attempt')) {
        session_start();

        $isSecondRequest = isset($_SESSION["isSecondRequest"]) && $_SESSION["isSecondRequest"] === true;

        if (!$isSecondRequest) {
            $_SESSION["isSecondRequest"] = true;
        }
    }

    $retryAfter = str_ends_with($route, '/retry-after') ? 2 : null;

    // NOTE(review): this includes TooManyRequests.php, although the route is named service-unavailable. This looks
    // like a copy and paste slip – verify whether a dedicated template for this route exists and should be used.
    return include(__DIR__ . '/_Server/TooManyRequests.php');
}

// Responds with a randomly picked client error status code and an empty body.
if (str_starts_with($route, '/client-error-response')) {
    $responseCodes = [400, 401, 404, 405, 410];

    // array_rand() always yields a key that exists in the list, so the random range can't get out of sync with
    // the number of status codes.
    http_response_code($responseCodes[array_rand($responseCodes)]);

    return;
}

// Responds with a randomly picked server error status code and an empty body.
if (str_starts_with($route, '/server-error-response')) {
    $responseCodes = [500, 502, 505, 521];

    http_response_code($responseCodes[array_rand($responseCodes)]);

    return;
}

// Responds with a gzip encoded string.
if (str_starts_with($route, '/gzip')) {
    header('Content-Type: application/x-gzip');

    echo gzencode('This is a gzip compressed string');

    // The response is complete, so stop here like the other routes do, instead of running through the
    // remaining route checks.
    return;
}

// Delays the (empty) response. usleep() takes microseconds, so this waits for 1.05 seconds.
if (str_starts_with($route, '/sleep')) {
    usleep(1050000);

    return;
}

// Publisher example: authors list => author detail => book detail => edition detail. A route starting with
// '/publisher' that matches none of the conditions inside, falls through to the routes further below.
if (str_starts_with($route, '/publisher')) {
    if ($route === '/publisher/authors') {
        return include __DIR__ . '/_Server/Publisher/AuthorsListPage.php';
    }

    if (str_starts_with($route, '/publisher/authors/')) {
        $author = getParamAfter($route, '/publisher/authors/');

        return include __DIR__ . '/_Server/Publisher/AuthorDetailPage.php';
    }

    if (str_starts_with($route, '/publisher/books/')) {
        $bookNo = (int) getParamAfter($route, '/publisher/books/');

        // Book routes containing an edition part lead to the edition page, all others to the book page.
        if (str_contains($route, '/edition/')) {
            $edition = (int) getParamAfter($route, '/edition/');

            return include __DIR__ . '/_Server/Publisher/EditionDetailPage.php';
        }

        return include __DIR__ . '/_Server/Publisher/BookDetailPage.php';
    }
}

// Redirects to itself, counting up the "no" query parameter with every redirect. With a numeric "stopAt" query
// parameter, redirecting ends with a success message as soon as "no" has reached that number.
if (str_starts_with($route, '/redirect')) {
    $redirectNo = (int) ($_GET['no'] ?? 0);

    $stopAtParam = $_GET['stopAt'] ?? null;

    // Query string part passing the stopAt value on to the next redirect; stays empty without a usable value.
    $stopAtQuery = '';

    if ($stopAtParam && is_numeric($stopAtParam)) {
        $stopAtNo = (int) $stopAtParam;

        if ($redirectNo >= $stopAtNo) {
            echo "success after {$redirectNo} redirects";

            return;
        }

        $stopAtQuery = '&stopAt=' . $stopAtNo;
    }

    header('Location: http://localhost:8000/redirect?no=' . ($redirectNo + 1) . $stopAtQuery);
}

if (str_starts_with($route, '/non-utf-8-charset')) {
    return include(__DIR__ . '/_Server/NonUtf8.php');
}

if (str_starts_with($route, '/page-init-script')) {
    return include(__DIR__ . '/_Server/PageInitScript.php');
}

// Feeds: the content type header is sent here, the feed content comes from the included files.
if ($route === '/rss-feed') {
    header('Content-Type: text/xml; charset=utf-8');

    return include(__DIR__ . '/_Server/RssFeed.php');
}

if ($route === '/broken-mime-type-rss') {
    header('Content-Type: application/rss+xml; charset=UTF-8');

    return include(__DIR__ . '/_Server/BrokenMimeTypeRss.php');
}

// A robots.txt file without any restrictions.
if ($route === '/robots.txt') {
    // The content has to be echoed: only the output of a router script is sent to the client, a returned string
    // is discarded (which would result in an empty response body).
    echo <<<ROBOTSTXT
        User-Agent: *
        Disallow:
        ROBOTSTXT;

    return;
}

if ($route === '/hello-world') {
    return include(__DIR__ . '/_Server/HelloWorld.php');
}


================================================
FILE: tests/_Integration/_Server/BlogPostWithJsonLd.php
================================================
<!doctype html>
<html lang="en">
<head>
    <meta charset=utf-8>
    <title>Prevent Homograph Attacks using the crwlr/url Package - crwlr.software</title>
</head>
<body id="crw">
<nav>
    <div class="inner">
        <a href="https://www.crwlr.software" class="logo" title="crwlr.software"></a>
        <ul>
            <li class="sub-nav-parent"><a href="https://www.crwlr.software/packages" title="Overview of PHP packages">Packages</a></li>
            <li><a href="https://www.crwlr.software/blog" title="Blog about crawling and scraping with PHP">Blog</a></li>
            <li><a href="https://www.crwlr.software/contact" title="Get in touch">Contact</a></li>
        </ul>
    </div>
</nav>
<main id="content">
    <div class="inner">
        <article class="blog-post">
            <h1>Prevent Homograph Attacks using the crwlr/url Package</h1>
            <div class="date">2022-01-19</div>
            <p>This post is not crawling/scraping related, but about another
                valuable use case for the url package, to prevent so-called
                homograph attacks.</p>
            <h2>About the attack</h2>
            <p>Homograph attacks are using internationalized domain names (IDN) for
                malicious links including domains that look like trusted organizations.
                You might know attacks where they want to trick you with typos
                like faecbook or things like zeros instead of Os (g00gle).
                Using internationalized domain names this kind of attack is even
                harder to spot because they are using characters that almost exactly
                look like other characters (also depending on the font they're
                displayed with).</p>
            <h3>Can you see the difference between those two As?</h3>
            <p>a а</p>
            <p>No? But in fact they aren't the same. The second one is a Cyrillic
                character.<br />
                You can check it e.g. by using PHP's ord function.</p>
            <pre><code class="language-php">var_dump(ord('a')); // int(97)
var_dump(ord('а')); // int(208)</code></pre>
            <p>Browsers already implemented mechanisms to warn users that a page
                they're visiting might not be as legitimate as they thought.</p>
            <p>But still: if on your website, you are linking to urls originating
                from user input, it'd be a good idea to have an eye on urls
                containing internationalized domain names.</p>
            <h2>How to identify IDN urls using the Url class</h2>
            <p>The Url class has the handy <code>hasIdn</code> method:</p>
            <pre><code class="language-php">$legitUrl = Url::parse('https://www.apple.com');
$seemsLegitUrl = Url::parse('https://www.аpple.com');

var_dump($legitUrl-&gt;hasIdn());              // bool(false)
var_dump($seemsLegitUrl-&gt;hasIdn());         // bool(true)

var_dump($legitUrl-&gt;__toString());          // string(21) "https://www.apple.com"
var_dump($seemsLegitUrl-&gt;__toString());     // string(28) "https://www.xn--pple-43d.com"</code></pre>
            <p>So you see, it's very easy to identify IDN urls with it. Of course
                there are many legitimate IDN domains, so you might not want to
                automatically block all of them. I'd suggest you could put some kind
                of monitoring in place that notifies you about users posting links
                to IDNs.</p>
            <p>Maybe you're operating in a country where IDNs are very common. Maybe
                in that case you can find a way to automatically sort out legitimate
                uses from your area.</p>
        </article>
        <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"BlogPosting","headline":"Prevent Homograph Attacks using the crwlr\/url Package","author":{"@type":"Person","name":"Christian Olear","alternateName":"Otsch"},"description":"Homograph attacks are using internationalized domain names (IDN) for malicious links including domains that look like trusted organizations. You can use the crwlr Url class to detect and monitor urls containing IDNs in your user's input.","dateCreated":"2022-01-19","datePublished":"2022-01-19","keywords":"homograph, attack, security, idn, internationalized domain names, prevention, url, uri"}</script>
    </div>
</main>
<footer>
    <div class="inner">
        <div class="tiles">
            <div class="tile-hidden">
                <p class="no-margin-top">Follow crwlr.software on</p>
                <a href="https://github.com/crwlrsoft" target="_blank" rel="noopener"title="crwlr.software on GitHub">GitHub</a>
                <a href="https://twitter.com/crwlrsoft" target="_blank" rel="noopener"title="Follow crwlr.software on Twitter!">Twitter</a>
            </div>
            <div class="tile-hidden">
                <a href="/privacy">Privacy</a>
                <a href="/imprint">Imprint</a>
            </div>
        </div>
    </div>
</footer>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/BrokenMimeTypeRss.php
================================================
<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
                                           xmlns:content="http://purl.org/rss/1.0/modules/content/"
                                           xmlns:wfw="http://wellformedweb.org/CommentAPI/"
                                           xmlns:dc="http://purl.org/dc/elements/1.1/"
                                           xmlns:atom="http://www.w3.org/2005/Atom"
                                           xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
                                           xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
>

    <channel>
        <title>Lorem ipsum</title>
        <atom:link href="https://www.example.com/feed/" rel="self" type="application/rss+xml" />
        <link>https://www.example.com/</link>
        <description>Lorem ipsum dolor sit amet</description>
        <lastBuildDate>Fri, 10 Jan 2025 10:48:01 +0000</lastBuildDate>
        <language>en</language>
        <sy:updatePeriod>
            hourly	</sy:updatePeriod>
        <sy:updateFrequency>
            1	</sy:updateFrequency>

        <item>
            <title>Foo</title>
            <link>https://www.example.com/some-article</link>
            <comments>https://www.example.com/some-article#comments</comments>

            <dc:creator><![CDATA[Christian Olear]]></dc:creator>
            <pubDate>Fri, 10 Jan 2025 10:48:01 +0000</pubDate>
            <category><![CDATA[Foo]]></category>
            <category><![CDATA[Bar]]></category>
            <category><![CDATA[Baz]]></category>
            <guid isPermaLink="false">https://www.example.com/?a=123</guid>

            <description><![CDATA[<p>Lorem ipsum dolor</p><p>sit amet</p>]]></description>


            <enclosure url="https://www.example.com/some-article/image.jpg" type="image/jpeg" />	</item>
    </channel>
</rss>


================================================
FILE: tests/_Integration/_Server/BrowserActions/ClickAndWaitForReload.php
================================================
<!doctype html>
<html lang="de">
<head>
    <meta charset=utf-8>
    <title>Hello World</title>
</head>
<body>
<div>
    <div id="click">Click here</div>

    <script>
        document.getElementById('click').addEventListener('click', function (ev) {
            setTimeout(function () {
                window.location.href = '/browser-actions/click-and-wait-for-reload?reloaded=1';
            }, 200);
        })
    </script>

    <?php if (isset($_GET['reloaded'])) { ?>
        <div id="reloaded">yes</div>
    <?php } ?>
</div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReload.php
================================================
<!doctype html>
<html lang="de">
<head><meta charset=utf-8><title>Hello World</title></head>
<body></body>
</html>


================================================
FILE: tests/_Integration/_Server/BrowserActions/EvaluateAndWaitForReloadReloaded.php
================================================
<!doctype html>
<html lang="de">
<head><meta charset=utf-8><title>Hello World</title></head>
<body>
<div id="reloaded">yay</div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/BrowserActions/Main.php
================================================
<!doctype html>
<html lang="de">
<head>
    <meta charset=utf-8>
    <title>Hello World</title>
    <style>
        #mouseover_check_1 { position: absolute; top: 200px; right: 100px; }
        #mouseover_check_2 { position: absolute; top: 400px; left:300px; }
        #scroll_down_check { position: absolute; top: 4000px; }
        #scroll_up_check { position: absolute; top: 2000px; }
    </style>
</head>
<body>
<div>
    <div id="click_el_wrapper"></div>
    <div id="shadow_host"></div>
    <div id="evaluation_container"></div>
    <div id="input_wrapper">
        <div id="input_value"></div>
        <input type="text" id="input" />
    </div>
    <div id="mouseover_check_1">mouse wasn't here yet</div>
    <div id="mouseover_check_2">mouse wasn't here yet</div>
    <div id="scroll_up_check">not scrolled up yet</div>
    <div id="scroll_down_check">not scrolled down yet</div>

    <script>
        setTimeout(function () {
            document.getElementById('click_el_wrapper').innerHTML = '<div id="click_worked"></div>' + "\n" +
                '<div id="click_element" onclick="document.getElementById(\'click_worked\').innerHTML = \'yes\'">' +
                'Click me</div>';
        }, 200);
        const shadowHost = document.getElementById('shadow_host');
        const shadowDom = shadowHost.attachShadow({ mode: 'open' });
        const shadowClickDiv = document.createElement('div');
        shadowClickDiv.id = 'shadow_click_div';
        shadowClickDiv.innerHTML = 'Not clicked yet';
        shadowClickDiv.addEventListener('click', function () {
            this.innerHTML = 'clicked';
        }, false);
        shadowDom.appendChild(shadowClickDiv);
        document.getElementById('mouseover_check_1').addEventListener('mouseover', function () {
            this.innerHTML = 'mouse was here';
        });
        document.getElementById('mouseover_check_2').addEventListener('mouseover', function () {
            this.innerHTML = 'mouse was here';
        });
        document.addEventListener('scroll', function () {
            const elementIsVisibleInViewport = (el, partiallyVisible = false) => {
                const { top, left, bottom, right } = el.getBoundingClientRect();
                const { innerHeight, innerWidth } = window;
                return partiallyVisible
                    ? ((top > 0 && top < innerHeight) ||
                        (bottom > 0 && bottom < innerHeight)) &&
                    ((left > 0 && left < innerWidth) || (right > 0 && right < innerWidth))
                    : top >= 0 && left >= 0 && bottom <= innerHeight && right <= innerWidth;
            };

            const scrollDownCheckEl = document.getElementById('scroll_down_check');
            const scrollUpCheckEl = document.getElementById('scroll_up_check');

            if (elementIsVisibleInViewport(scrollDownCheckEl, true) && scrollDownCheckEl.innerHTML !== 'scrolled down') {
                scrollDownCheckEl.innerHTML = 'scrolled down';
            }

            if (
                elementIsVisibleInViewport(scrollUpCheckEl, true) &&
                scrollDownCheckEl.innerHTML === 'scrolled down' &&
                scrollUpCheckEl.innerHTML !== 'scrolled up'
            ) {
                scrollUpCheckEl.innerHTML = 'scrolled up';
            }
        }, false);
        document.getElementById('input').addEventListener('input', function (ev) {
            document.getElementById('input_value').innerHTML = document.getElementById('input').value;
        }, false);
    </script>
</div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/BrowserActions/Wait.php
================================================
<!doctype html>
<html lang="de">
<head>
    <meta charset=utf-8>
    <title>Hello World</title>
</head>
<body>
<div>
    <div id="insert_here"></div>

    <script>
        setTimeout(function () {
            document.getElementById('insert_here').innerHTML = '<div id="delayed_container">hooray</div>';
        }, 200);
    </script>
</div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/Crawling.php
================================================
<?php

/**
 * Structure:
 *
 * /crawling/main
 *  => /crawling/sub1
 *      => /crawling/sub1/sub1
 *  => /crawling/sub2
 *      => /crawling/sub2/sub1
 *          => /crawling/sub2/sub1/sub1
 */

// Plain sitemap, listing all pages of the structure described above with the host www.example.com.
if ($route === '/crawling/sitemap.xml') {
    echo <<<XML
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>http://www.example.com/crawling/main</loc></url>
<url><loc>http://www.example.com/crawling/sub1</loc></url>
<url><loc>http://www.example.com/crawling/sub1/sub1</loc></url>
<url><loc>http://www.example.com/crawling/sub2</loc></url>
<url><loc>http://www.example.com/crawling/sub2/sub1</loc></url>
<url><loc>http://www.example.com/crawling/sub2/sub1/sub1</loc></url>
</urlset>
XML;
}

// The same URLs as above, but the document additionally has an xml-stylesheet processing instruction and
// further namespace/schema attributes on the urlset element.
if ($route === '/crawling/sitemap2.xml') {
    echo <<<XML
<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="/typo3/sysext/seo/Resources/Public/CSS/Sitemap.xsl"?>
<urlset xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd http://www.google.com/schemas/sitemap-image/1.1 http://www.google.com/schemas/sitemap-image/1.1/sitemap-image.xsd" xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url><loc>http://www.example.com/crawling/main</loc></url>
<url><loc>http://www.example.com/crawling/sub1</loc></url>
<url><loc>http://www.example.com/crawling/sub1/sub1</loc></url>
<url><loc>http://www.example.com/crawling/sub2</loc></url>
<url><loc>http://www.example.com/crawling/sub2/sub1</loc></url>
<url><loc>http://www.example.com/crawling/sub2/sub1/sub1</loc></url>
</urlset>
XML;
}

// Both routes permanently redirect to the main page, marked with the query string ?redirect=1.
if (in_array($route, ['/crawling', '/crawling/redirect'], true)) {
    header('Location: http://www.example.com/crawling/main?redirect=1', true, 301);

    return '';
}

// The main page. When requested with a non-empty redirect query parameter, it additionally contains a link
// to /crawling.
if (in_array($route, ['/crawling/main', '/crawling/main?redirect=1'], true)) {
    $redirectLinkHtml = empty($_GET['redirect']) ? '' : PHP_EOL . '<a href="/crawling">link</a>';

    echo <<<HTML
        <!doctype html>
        <html lang="en">
        <body>
            {$redirectLinkHtml}

            <a href="/crawling/sub1">Subpage 1</a> <br>
            <a href="/crawling/sub2">Subpage 2</a> <br>
            <a href="/crawling/sub2#fragment1">Subpage 2 - Fragment 1</a> <br>
            <a href="/crawling/sub2#fragment2">Subpage 2 - Fragment 2</a> <br>

            <a href="https://www.crwlr.software/packages/crawler">External link</a>

            <a href="mailto:somebody@example.com">mailto link</a>
            <a href="javascript:alert('hello');">javascript link</a>
            <a href="tel:+499123456789">phone link</a>

            <a href="//">broken link</a>
        </body>
        </html>
        HTML;
}

// Subpage 1: has a <base href="/crawling/"> element and links to its subpage with a relative URL. It also
// contains a canonical link, an external link and a link to a subdomain.
if ($route === '/crawling/sub1') {
    echo <<<HTML
        <!doctype html>
        <html lang="en">
        <head>
            <title>foo</title>
            <base href="/crawling/">
            <link rel="canonical" href="/crawling/sub1/sub1" />
        </head>
        <body>
            <a href="sub1/sub1">Subpage 1 of Subpage 1</a> <br>

            <a href="https://www.foo.com">External link</a>

            <a href="http://foo.example.com/crawling/main-on-subdomain">Link to subdomain</a>
        </body>
        </html>
        HTML;
}

// Last level below subpage 1, only linking back to the main page.
if ($route === '/crawling/sub1/sub1') {
    echo <<<HTML
        <!doctype html>
        <html lang="en">
        <body>
            <h1>Final level of sub1</h1>
            <h2>Subpage 1 of Subpage 1</h2>
            <a href="/crawling/main">Back to main</a>
        </body>
        </html>
        HTML;
}

// Subpage 2, linking to its own subpage.
if ($route === '/crawling/sub2') {
    echo <<<HTML
        <!doctype html>
        <html lang="en">
        <body>
            <a href="/crawling/sub2/sub1">Subpage 1 of Subpage 2</a>
        </body>
        </html>
        HTML;
}

// Subpage 1 of subpage 2: its canonical link points to a different page (/crawling/sub1/sub1).
if ($route === '/crawling/sub2/sub1') {
    echo <<<HTML
        <!doctype html>
        <html lang="en">
        <head>
            <title>foo</title>
            <link rel="canonical" href="/crawling/sub1/sub1" />
        </head>
        <body>
            <a href="/crawling/sub2/sub1/sub1">Subpage 1 of Subpage 1 of Subpage 2</a>
        </body>
        </html>
        HTML;
}

// Last level below subpage 2, only linking back to subpage 2.
if ($route === '/crawling/sub2/sub1/sub1') {
    echo <<<HTML
        <!doctype html>
        <html lang="en">
        <body>
            <h1>Final level of sub2</h1>
            <h2>Subpage 1 of Subpage 1 of Subpage 2</h2>
            <a href="/crawling/sub2">Back to Subpage 2</a>
        </body>
        </html>
        HTML;
}

// Target of the subdomain link on subpage 1.
if ($route === '/crawling/main-on-subdomain') {
    echo <<<HTML
        <!doctype html>
        <html lang="en">
        <body>
            <h1>Main page on subdomain</h1>
        </body>
        </html>
        HTML;
}


================================================
FILE: tests/_Integration/_Server/HelloWorld.php
================================================
<!Doctype html>
<html>
<head>
    <title>Hello World!</title>
</head>
<body>
Hello World!
</body>
</html>


================================================
FILE: tests/_Integration/_Server/JsGeneratedContent.php
================================================
<!Doctype html>
<html lang="en">
<head>
    <title>JS Generated Content</title>
</head>
<body>
<div id="content">
</div>
<script>
    document.querySelector('#content').innerHTML = '<p>This was added through javascript</p>';
</script>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/NonUtf8.php
================================================
<!Doctype html>
<html>
<head>
    <meta charset="UTF-8">
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>Non UTF-8 charset page</title>
</head>
<body>
<div class="element">
<?php
    // Byte values of the printed string. The last one, 178, is the square sign (² in ISO-8859-1), but broken
    // in UTF-8.
    $byteValues = [48, 32, 108, 47, 109, 178];

    echo implode('', array_map(static fn (int $byteValue): string => chr($byteValue), $byteValues));
?>
</div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/PageInitScript.php
================================================
<!Doctype html>
<html>
<head>
</head>
<body>
<div id="content"></div>
<script>
    document.getElementById('content').innerHTML = window._secret_content;
</script>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/PaginatedListing/Detail.php
================================================
<!Doctype html>
<html>
<head>
    <title>Paginated listing item detail</title>
</head>
<body>
<article>
    <h1>Some Item <?=$itemId?></h1>
    <span class="someNumber"><?=($itemId * 10)?></span>
</article>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/PaginatedListing.php
================================================
<!Doctype html>
<html>
<head>
    <title>Paginated Listing</title>
</head>
<body>
<div id="listing">
    <?php
        // The page number comes from the query string. Cast it to int, so a non-numeric value can't cause a
        // TypeError in the calculations below.
        $page = (int) ($_GET['page'] ?? 1);
        $additionalFooQueryParam = '';
        $itemsPerPage = 3;

        // An optional "foo" query parameter is carried over into the pagination links.
        if (!empty($_GET['foo'])) {
            $additionalFooQueryParam = '&foo=' . $_GET['foo'];
        }

        // Pages below 4 list $itemsPerPage items each, all other pages only a single item.
        if ($page < 4) {
            for ($i = 1; $i <= $itemsPerPage; $i++) {
                $itemNumber = (($page - 1) * $itemsPerPage) + $i; ?>
                <div class="item">
                    <a href="/paginated-listing/items/<?=$itemNumber?>">Item <?=$itemNumber?></a>
                    <p>asdlfkj asdlfka jsdlfk ajsdflk</p>
                </div>
            <?php } ?>
        <?php } else {
            $itemNumber = (($page - 1) * $itemsPerPage) + 1; ?>
            <div class="item">
                <a href="/paginated-listing/items/<?=$itemNumber?>">Item <?=$itemNumber?></a>
                <p>asdflk jasdlfk asdlfk asldfk</p>
            </div>
        <?php } ?>

    <div id="pagination">
        <?php if ($page > 1) { ?>
            <a id="prevPage" href="/paginated-listing?page=<?=($page - 1) . $additionalFooQueryParam?>">&lt;&lt;</a>
        <?php } ?>

        <?php if ($page < 4) { ?>
            <a id="nextPage" href="/paginated-listing?page=<?=($page + 1) . $additionalFooQueryParam?>">&gt;&gt;</a>
        <?php } ?>
    </div>
</div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/PrintCookie.php
================================================
<?php

// Prints the value of the cookie named "testcookie" received with the request, or nothing if there is none.
echo $_COOKIE['testcookie'] ?? '';


================================================
FILE: tests/_Integration/_Server/PrintCookies.php
================================================
<?php

// Prints all cookies received with the request as name=value pairs, separated by semicolons (no trailing one).
if (is_array($_COOKIE)) {
    $cookiePairs = [];

    foreach ($_COOKIE as $cookieName => $cookieValue) {
        $cookiePairs[] = $cookieName . '=' . $cookieValue;
    }

    echo implode(';', $cookiePairs);
}


================================================
FILE: tests/_Integration/_Server/PrintHeaders.php
================================================
<?php

// Responds with all headers of the received request, encoded as JSON.
header('Content-Type: application/json');

echo json_encode(getallheaders());


================================================
FILE: tests/_Integration/_Server/Publisher/AuthorDetailPage.php
================================================
<?php
/**
 * Author detail page of the "publisher" test site.
 *
 * Reads $author, which has to be defined before this template runs (presumably by the including
 * script - verify against the caller). The value 'john' renders John's data, every other value
 * renders Susan's data.
 */
?>
<!doctype html>
<html lang="en">
<head>
    <meta charset=utf-8>
    <title><?= $author ?></title>
</head>
<body>
<h1>
    <?= $author === 'john' ? 'John Example' : 'Susan Example' ?>
</h1>

<div id="author-data">
    <div class="age">
        <?= $author === 'john' ? '51' : '49' ?>
    </div>
    <div class="born-in">
        <?= $author === 'john' ? 'Lisbon' : 'Athens' ?>
    </div>
    <div class="books">
        <?php if ($author === 'john'): ?>
            <a class="book" href="/publisher/books/1"><img src="/images/book1.jpg" /></a>
            <a class="book" href="/publisher/books/2"><img src="/images/book2.jpg" /></a>
        <?php else: ?>
            <a class="book" href="/publisher/books/3"><img src="/images/book3.jpg" /></a>
            <a class="book" href="/publisher/books/4"><img src="/images/book4.jpg" /></a>
            <a class="book" href="/publisher/books/5"><img src="/images/book5.jpg" /></a>
        <?php endif; ?>
    </div>
</div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/Publisher/AuthorsListPage.php
================================================
<!doctype html>
<html lang="en">
<head>
    <meta charset=utf-8>
    <title>Example Publishing Authors</title>
</head>
<body>
    <h1>Our authors</h1>

    <div id="authors">
        <a href="/publisher/authors/john">John Example</a> <br>
        <a href="/publisher/authors/susan">Susan Example</a>
    </div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/Publisher/BookDetailPage.php
================================================
<?php
/**
 * Book detail page of the "publisher" test site.
 *
 * Reads $bookNo, which has to be defined before this template runs (presumably by the including
 * script - verify against the caller). Only the integers 1 to 5 are known books; for any other
 * value the heading and the editions container stay empty.
 */
?>
<!doctype html>
<html lang="en">
<head>
    <meta charset=utf-8>
    <title>Book</title>
</head>
<body>
<h1>
    <?php
        // match compares with ===, so only integer book numbers produce a title.
        echo match ($bookNo) {
            1 => 'Some novel',
            2 => 'Another novel',
            3 => 'Poems #1',
            4 => 'Poems #2',
            5 => 'Poems #3',
            default => '',
        };
    ?>
</h1>

<div id="editions">
<?php
    // Strict in_array() keeps this check consistent with the strict comparisons used for the
    // title: loosely equal values such as '1' or true must not render edition links either.
    if (in_array($bookNo, [1, 3, 4, 5], true)) {
        // Books with two editions.
        echo '<a href="/publisher/books/' . $bookNo . '/edition/1">First Edition</a> ' .
            '<a href="/publisher/books/' . $bookNo . '/edition/2">Second Edition</a>';
    } elseif ($bookNo === 2) {
        // "Another novel" is the only book with three editions.
        echo '<a href="/publisher/books/' . $bookNo . '/edition/1">First Edition</a> ' .
            '<a href="/publisher/books/' . $bookNo . '/edition/2">Second Edition</a> ' .
            '<a href="/publisher/books/' . $bookNo . '/edition/3">Third Edition</a>';
    }
?>
</div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/Publisher/EditionDetailPage.php
================================================
<?php
/**
 * Book edition detail page of the "publisher" test site.
 *
 * Reads $bookNo and $edition, which have to be defined before this template runs (presumably by
 * the including script - verify against the caller). Prints the year and the publishing company
 * of a known book/edition combination and nothing for unknown combinations.
 */
?>
<!doctype html>
<html lang="en">
<head>
    <meta charset=utf-8>
    <title>Book Edition</title>
</head>
<body>
<?php
    // Book number => edition number => [year, publishing company].
    $catalogue = [
        // Some Novel
        1 => [1 => ['1996', 'Foo'], 2 => ['2005', 'Foo']],
        // Another Novel
        2 => [1 => ['2001', 'Foo'], 2 => ['2009', 'Bar'], 3 => ['2017', 'Bar']],
        // Poems #1
        3 => [1 => ['2008', 'Poems'], 2 => ['2009', 'Poems']],
        // Poems #2
        4 => [1 => ['2011', 'Poems'], 2 => ['2014', 'New Poems']],
        // Poems #3
        5 => [1 => ['2013', 'Poems'], 2 => ['2017', 'New Poems']],
    ];

    // Only real integers may match: array keys would otherwise also accept numeric strings.
    if (is_int($bookNo) && is_int($edition) && isset($catalogue[$bookNo][$edition])) {
        [$year, $publishingCompany] = $catalogue[$bookNo][$edition];

        echo sprintf(
            '<span class="year">%s</span> <span class="publishingCompany">%s</span>',
            $year,
            $publishingCompany,
        );
    }
?>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/QueryParamPagination.php
================================================
<?php

// The page can be requested via the "page" query parameter or via the raw request body
// (e.g. a body consisting of "page=2").
$query = isset($_GET['page'])
    ? 'page=' . $_GET['page']
    : file_get_contents('php://input');

// Only pages 1 to 3 contain items, every other request gets an empty list.
$hasItems = in_array($query, ['page=1', 'page=2', 'page=3'], true);

echo $hasItems
    ? '{ "data": { "items": ["one", "two", "three"] } }'
    : '{ "data": { "items": [] } }';


================================================
FILE: tests/_Integration/_Server/RssFeed.php
================================================
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xml:base="https://www.example.com" xmlns:dc="http://purl.org/dc/elements/1.1/"
     xmlns:atom="http://www.w3.org/2005/Atom" xmlns:media="http://search.yahoo.com/mrss/"
     xmlns:content="http://purl.org/rss/1.0/modules/content/">
    <channel>
        <title>Example</title>
        <link>https://www.example.com</link>
        <description>Public RSS feed</description>
        <language>en</language>
        <atom:link href="https://www.example.com/feeds/rss.xml" rel="self" type="application/rss+xml" />
        <item>
            <title><![CDATA[Foo, bar, baz]]></title>
            <link><![CDATA[https://www.example.com/foo/bar-baz]]></link>
            <description><![CDATA[Lorem ipsum]]></description>
            <pubDate>Wed, 08 Jan 2025 12:14:47 GMT</pubDate>
            <dc:creator>Christian Olear</dc:creator>
            <guid isPermaLink="false"><![CDATA[https://www.example.com/foo/bar-baz]]></guid>
            <media:thumbnail url="https://images.example.com/foo-bar-baz.jpg" />
        </item>
    </channel>
</rss>


================================================
FILE: tests/_Integration/_Server/ServiceUnavailable.php
================================================
<?php

// Answer with 503 unless $isSecondRequest is defined and exactly true.
if (($isSecondRequest ?? null) !== true) {
    http_response_code(503);
}

// Optionally send a Retry-After header with the value of $retryAfter.
if (isset($retryAfter)) {
    header('Retry-After: ' . $retryAfter);
}


================================================
FILE: tests/_Integration/_Server/SetCookie.php
================================================
<?php

// Send the cookie "testcookie=foo123" (no expiry or further options given) and a short body.
setcookie(name: 'testcookie', value: 'foo123');

echo 'set cookie';


================================================
FILE: tests/_Integration/_Server/SetCookieJs.php
================================================
<!doctype html>
<html lang="de">
<head><meta charset=utf-8><title>yo</title></head>
<body>
<div>{$cookies}</div>
<script>document.cookie = "testcookie=javascriptcookie";</script>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/SetDelayedCookieJs.php
================================================
<!doctype html>
<html lang="de">
<head>
    <meta charset=utf-8><title>Hey</title>
    <script src="/scripts/set-cookie.js"></script>
</head>
<body>
<div>
    <button type="button" id="consent_btn">
        Accept Cookie
    </button>
</div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/SetMultipleCookiesJs.php
================================================
<!doctype html>
<html lang="de">
<head><meta charset=utf-8><title>yo</title></head>
<body>
<script>
    document.cookie = "cookie1=cookie1value";
    document.cookie = "cookie2=cookie2value";
    document.cookie = "cookie3=cookie3value";
</script>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/SimpleListing/Detail.php
================================================
<?php
/**
 * Article detail page of the simple listing test site.
 *
 * Reads $articleId, which has to be defined before this template runs (presumably by the
 * including script - verify against the caller). The day of the printed date is 12 + $articleId.
 */
?>
<!Doctype html>
<html>
<head>
    <title>Simple listing article detail</title>
</head>
<body>
<article>
    <h1>Some Article <?= $articleId ?></h1>
    <span class="date">2022-04-<?= 12 + $articleId ?></span>
    <span class="articleAuthor">Christian Olear</span>
</article>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/SimpleListing.php
================================================
<!Doctype html>
<html>
<head>
    <title>Simple article listing</title>
</head>
<body>
    <div id="listing">
        <div class="listingItem">
            <a href="/simple-listing/article/1">Article 1</a>
            <p>asdfa sdlfka sdflkja sdflkj</p>
        </div>
        <div class="listingItem">
            <a href="/simple-listing/article/2">Article 2</a>
            <p>asldfkj aldfk jaslfk asdjflkajsdlf</p>
        </div>
        <div class="listingItem">
            <a href="/simple-listing/article/3">Article 3</a>
            <p>asldfk aslfdkjasd flkajsdfl kajsdflakjsdlf</p>
        </div>
    </div>
</body>
</html>


================================================
FILE: tests/_Integration/_Server/TooManyRequests.php
================================================
<?php

// Answer with 429 unless $isSecondRequest is defined and exactly true.
if (($isSecondRequest ?? null) !== true) {
    http_response_code(429);
}

// Optionally send a Retry-After header with the value of $retryAfter.
if (isset($retryAfter)) {
    header('Retry-After: ' . $retryAfter);
}


================================================
FILE: tests/_Stubs/AbstractTestPaginator.php
================================================
<?php

namespace tests\_Stubs;

use Crwlr\Crawler\Steps\Loading\Http\AbstractPaginator;
use Crwlr\Crawler\Steps\Loading\Http\Paginator;
use GuzzleHttp\Psr7\Request;
use Psr\Http\Message\RequestInterface;

/**
 * Paginator test double built on AbstractPaginator.
 *
 * It always proposes the same next URL and offers public accessors for state inherited from
 * AbstractPaginator ($loaded, $loadedCount, $latestRequest), so tests can inspect it.
 */
class AbstractTestPaginator extends AbstractPaginator
{
    /**
     * @param int $maxPages handed on to the parent constructor
     * @param string $nextUrl URL of every request created by getNextRequest()
     */
    public function __construct(
        int $maxPages = Paginator::MAX_PAGES_DEFAULT,
        private readonly string $nextUrl = 'https://www.example.com/bar',
    ) {
        parent::__construct($maxPages);
    }

    /**
     * Creates a new GET request to the configured next URL on every call.
     */
    public function getNextRequest(): ?RequestInterface
    {
        $nextRequest = new Request('GET', $this->nextUrl);

        return $nextRequest;
    }

    /**
     * Returns the inherited $loaded property.
     *
     * @return array<string, true>
     */
    public function getLoaded(): array
    {
        $loaded = $this->loaded;

        return $loaded;
    }

    /**
     * Returns the inherited $loadedCount property.
     */
    public function getLoadedCount(): int
    {
        $count = $this->loadedCount;

        return $count;
    }

    /**
     * Returns the inherited $latestRequest property.
     */
    public function getLatestRequest(): ?RequestInterface
    {
        $latest = $this->latestRequest;

        return $latest;
    }

    /**
     * Public wrapper around maxPagesReached().
     */
    public function limitReached(): bool
    {
        $reached = $this->maxPagesReached();

        return $reached;
    }

    /**
     * Declared public here; simply forwards to the parent implementation.
     */
    public function setFinished(): AbstractPaginator
    {
        $paginator = parent::setFinished();

        return $paginator;
    }
}


================================================
FILE: tests/_Stubs/Crawlers/DummyOne.php
================================================
<?php

namespace tests\_Stubs\Crawlers;

use Crwlr\Crawler\Crawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\UserAgents\BotUserAgent;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Mockery;
use Psr\Log\LoggerInterface;

/**
 * Minimal Crawler implementation for tests: a "FooBot" bot user agent and a mocked loader.
 */
class DummyOne extends Crawler
{
    /**
     * Creates the bot user agent named "FooBot".
     *
     * @return BotUserAgent
     */
    public function userAgent(): UserAgentInterface
    {
        $botUserAgent = new BotUserAgent('FooBot');

        return $botUserAgent;
    }

    /**
     * Creates a Mockery mock of the LoaderInterface; both arguments are ignored.
     */
    public function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        $loaderMock = Mockery::mock(LoaderInterface::class);

        return $loaderMock;
    }
}


================================================
FILE: tests/_Stubs/Crawlers/DummyTwo/DummyTwoLoader.php
================================================
<?php

namespace tests\_Stubs\Crawlers\DummyTwo;

use Crwlr\Crawler\Loader\Http\HttpLoader;

/**
 * Test stub: an HttpLoader subclass that only adds a public marker property.
 */
class DummyTwoLoader extends HttpLoader
{
    /** Marker property with the default value 'foo'. */
    public string $testProperty = 'foo';
}


================================================
FILE: tests/_Stubs/Crawlers/DummyTwo/DummyTwoLogger.php
================================================
<?php

namespace tests\_Stubs\Crawlers\DummyTwo;

use Crwlr\Crawler\Logger\CliLogger;

/**
 * Test stub: a CliLogger subclass that only adds a public marker property.
 */
class DummyTwoLogger extends CliLogger
{
    /** Marker property with the default value 'foo'. */
    public string $testProperty = 'foo';
}


================================================
FILE: tests/_Stubs/Crawlers/DummyTwo/DummyTwoUserAgent.php
================================================
<?php

namespace tests\_Stubs\Crawlers\DummyTwo;

use Crwlr\Crawler\UserAgents\BotUserAgent;

/**
 * Test stub: a BotUserAgent subclass that only adds a public marker property.
 */
class DummyTwoUserAgent extends BotUserAgent
{
    /** Marker property with the default value 'foo'. */
    public string $testProperty = 'foo';
}


================================================
FILE: tests/_Stubs/Crawlers/DummyTwo.php
================================================
<?php

namespace tests\_Stubs\Crawlers;

use Crwlr\Crawler\Crawler;
use Crwlr\Crawler\Loader\LoaderInterface;
use Crwlr\Crawler\UserAgents\UserAgentInterface;
use Psr\Log\LoggerInterface;
use tests\_Stubs\Crawlers\DummyTwo\DummyTwoLoader;
use tests\_Stubs\Crawlers\DummyTwo\DummyTwoLogger;
use tests\_Stubs\Crawlers\DummyTwo\DummyTwoUserAgent;

/**
 * Crawler test double that builds DummyTwo* user agent, logger and loader instances and counts
 * how often each of the three factory methods has been called.
 *
 * @property DummyTwoUserAgent $userAgent
 * @property DummyTwoLogger $logger
 * @property DummyTwoLoader $loader
 * @method DummyTwoUserAgent getUserAgent()
 * @method DummyTwoLogger getLogger()
 * @method DummyTwoLoader getLoader()
 */

class DummyTwo extends Crawler
{
    /** Number of userAgent() calls so far. */
    public int $userAgentCalled = 0;

    /** Number of logger() calls so far. */
    public int $loggerCalled = 0;

    /** Number of loader() calls so far. */
    public int $loaderCalled = 0;

    /**
     * Counts the call and creates a user agent named "FooBot".
     *
     * @return DummyTwoUserAgent
     */
    protected function userAgent(): UserAgentInterface
    {
        ++$this->userAgentCalled;

        return new DummyTwoUserAgent('FooBot');
    }

    /**
     * Counts the call and creates a new logger.
     *
     * @return DummyTwoLogger
     */
    protected function logger(): LoggerInterface
    {
        ++$this->loggerCalled;

        return new DummyTwoLogger();
    }

    /**
     * Counts the call and creates a loader from the given user agent and logger (the second
     * constructor argument is passed as null).
     *
     * @return DummyTwoLoader
     */
    protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger): LoaderInterface
    {
        ++$this->loaderCalled;

        return new DummyTwoLoader($userAgent, null, $logger);
    }
}


================================================
FILE: tests/_Stubs/DummyLogger.php
================================================
<?php

namespace tests\_Stubs;

use InvalidArgumentException;
use Psr\Log\LoggerInterface;
use Stringable;
use UnexpectedValueException;

/**
 * In-memory logger for tests: every logged message is appended to the public $messages list
 * together with its level. The context array is accepted but not stored.
 */
class DummyLogger implements LoggerInterface
{
    /**
     * Log levels accepted by log().
     */
    private const LEVELS = ['emergency', 'alert', 'critical', 'error', 'warning', 'notice', 'info', 'debug'];

    /**
     * All recorded log entries in the order they were logged (keys "level" and "message").
     *
     * @var array<int, array<string, string>>
     */
    public array $messages = [];

    /** Records the message with level "emergency". */
    public function emergency(string|Stringable $message, array $context = []): void
    {
        $this->log('emergency', $message, $context);
    }

    /** Records the message with level "alert". */
    public function alert(string|Stringable $message, array $context = []): void
    {
        $this->log('alert', $message, $context);
    }

    /** Records the message with level "critical". */
    public function critical(string|Stringable $message, array $context = []): void
    {
        $this->log('critical', $message, $context);
    }

    /** Records the message with level "error". */
    public function error(string|Stringable $message, array $context = []): void
    {
        $this->log('error', $message, $context);
    }

    /** Records the message with level "warning". */
    public function warning(string|Stringable $message, array $context = []): void
    {
        $this->log('warning', $message, $context);
    }

    /** Records the message with level "notice". */
    public function notice(string|Stringable $message, array $context = []): void
    {
        $this->log('notice', $message, $context);
    }

    /** Records the message with level "info". */
    public function info(string|Stringable $message, array $context = []): void
    {
        $this->log('info', $message, $context);
    }

    /** Records the message with level "debug". */
    public function debug(string|Stringable $message, array $context = []): void
    {
        $this->log('debug', $message, $context);
    }

    /**
     * Validates the level and appends the entry to $messages.
     *
     * @param mixed $level
     * @param mixed[] $context
     * @throws InvalidArgumentException when $level is not a string
     * @throws UnexpectedValueException when $level is not one of the known levels
     */
    public function log($level, string|Stringable $message, array $context = []): void
    {
        if (!is_string($level)) {
            throw new InvalidArgumentException('Level must be string.');
        }

        $isKnownLevel = in_array($level, self::LEVELS, true);

        if (!$isKnownLevel) {
            throw new UnexpectedValueException('Unknown log level.');
        }

        $entry = ['level' => $level, 'message' => $message];

        $this->messages[] = $entry;
    }
}


================================================
FILE: tests/_Stubs/PhantasyLoader.php
================================================
<?php

namespace tests\_Stubs;

use Crwlr\Crawler\Loader\Loader;

/**
 * Loader test stub that performs no real loading: both methods return the subject converted to
 * a string and prefixed with "loaded ".
 */
class PhantasyLoader extends Loader
{
    /**
     * @return string "loaded " followed by the subject
     */
    public function load(mixed $subject): mixed
    {
        return "loaded {$subject}";
    }

    /**
     * Returns the same value as load(); this stub never fails on its own.
     *
     * @return string "loaded " followed by the subject
     */
    public function loadOrFail(mixed $subject): mixed
    {
        return "loaded {$subject}";
    }
}


================================================
FILE: tests/_Stubs/RespondedRequestChild.php
================================================
<?php

namespace tests\_Stubs;

use Crwlr\Crawler\Loader\Http\Messages\RespondedRequest;
use Exception;

/**
 * RespondedRequest subclass for tests, recognisable by its extra itseme() method.
 */
class RespondedRequestChild extends RespondedRequest
{
    /**
     * Builds a child instance from the request and response of an existing RespondedRequest.
     *
     * @throws Exception
     */
    public static function fromRespondedRequest(RespondedRequest $respondedRequest): self
    {
        $request = $respondedRequest->request;

        $response = $respondedRequest->response;

        return new self($request, $response);
    }

    /**
     * Lets the parent build a RespondedRequest from the array, then converts it to a child
     * instance via fromRespondedRequest().
     */
    public static function fromArray(array $data): RespondedRequestChild
    {
        return self::fromRespondedRequest(parent::fromArray($data));
    }

    /**
     * Marker method that only exists on the child class.
     */
    public function itseme(): string
    {
        return 'mario';
    }
}


================================================
FILE: tests/_Temp/_cachedir/.gitkeep
================================================


================================================
FILE: tests/_Temp/_storagedir/.gitkeep
================================================